1 /*
   2  * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "gc/shared/barrierSet.hpp"
  28 #include "gc/shared/barrierSetAssembler.hpp"
  29 #include "oops/methodData.hpp"
  30 #include "opto/c2_MacroAssembler.hpp"
  31 #include "opto/intrinsicnode.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/opcodes.hpp"
  34 #include "opto/subnode.hpp"
  35 #include "runtime/globals.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 #include "utilities/checkedCast.hpp"
  39 #include "utilities/globalDefinitions.hpp"
  40 #include "utilities/powerOfTwo.hpp"
  41 #include "utilities/sizes.hpp"
  42 
  43 #ifdef PRODUCT
  44 #define BLOCK_COMMENT(str) /* nothing */
  45 #define STOP(error) stop(error)
  46 #else
  47 #define BLOCK_COMMENT(str) block_comment(str)
  48 #define STOP(error) block_comment(error); stop(error)
  49 #endif
  50 
  51 // C2 compiled method's prolog code.
  52 void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  53   if (C->clinit_barrier_on_entry()) {
  54     assert(VM_Version::supports_fast_class_init_checks(), "sanity");
  55     assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");
  56 
  57     Label L_skip_barrier;
  58     Register klass = rscratch1;
  59 
  60     mov_metadata(klass, C->method()->holder()->constant_encoding());
  61     clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
  62 
  63     jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
  64 
  65     bind(L_skip_barrier);
  66   }
  67 
  68   int framesize = C->output()->frame_size_in_bytes();
  69   int bangsize = C->output()->bang_size_in_bytes();
  70   bool fp_mode_24b = false;
  71   int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;
  72 
  73   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  74   // NativeJump::patch_verified_entry will be able to patch out the entry
  75   // code safely. The push to verify stack depth is ok at 5 bytes,
  76   // the frame allocation can be either 3 or 6 bytes. So if we don't do
  77   // stack bang then we must use the 6 byte frame allocation even if
  78   // we have no frame. :-(
  79   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  80 
  81   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  82   // Remove word for return addr
  83   framesize -= wordSize;
  84   stack_bang_size -= wordSize;
  85 
  86   // Calls to C2R adapters often do not accept exceptional returns.
  87   // We require that their callers bang for them.  But be careful, because
  88   // some VM calls (such as call site linkage) can use several kilobytes of
  89   // stack.  The stack safety zone should account for that, though.
  90   // See bugs 4446381, 4468289, 4497237.
  91   if (stack_bang_size > 0) {
  92     generate_stack_overflow_check(stack_bang_size);
  93 
  94     // We always push rbp, so that on return to the interpreter rbp will be
  95     // restored correctly and we can correct the stack.
  96     push(rbp);
  97     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  98     if (PreserveFramePointer) {
  99       mov(rbp, rsp);
 100     }
 101     // Remove word for ebp
 102     framesize -= wordSize;
 103 
 104     // Create frame
 105     if (framesize) {
 106       subptr(rsp, framesize);
 107     }
 108   } else {
 109     // Create frame (force generation of a 4 byte immediate value)
 110     subptr_imm32(rsp, framesize);
 111 
 112     // Save RBP register now.
 113     framesize -= wordSize;
 114     movptr(Address(rsp, framesize), rbp);
 115     // Save caller's stack pointer into RBP if the frame pointer is preserved.
 116     if (PreserveFramePointer) {
 117       movptr(rbp, rsp);
 118       if (framesize > 0) {
 119         addptr(rbp, framesize);
 120       }
 121     }
 122   }
 123 
 124   if (C->needs_stack_repair()) {
 125     // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
 126     assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
 127     movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize);
 128   }
 129 
 130   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 131     framesize -= wordSize;
 132     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 133   }
 134 
 135 #ifdef ASSERT
 136   if (VerifyStackAtCalls) {
 137     Label L;
 138     push(rax);
 139     mov(rax, rsp);
 140     andptr(rax, StackAlignmentInBytes-1);
 141     cmpptr(rax, StackAlignmentInBytes-wordSize);
 142     pop(rax);
 143     jcc(Assembler::equal, L);
 144     STOP("Stack is not properly aligned!");
 145     bind(L);
 146   }
 147 #endif
 148 }
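// A rough sketch of the C2 frame that verified_entry() sets up (a reading of the code
// above, not an authoritative layout; the cookie slot exists only with +VerifyStackAtCalls):
//
//   (high)  | return address              |  <- caller's rsp before the call
//           | saved rbp                   |
//           | [0xbadb100d cookie]         |
//           | spill slots / outgoing args |
//   (low)   | ...                         |  <- rsp after the prologue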
 149 
 150 void C2_MacroAssembler::entry_barrier() {
 151   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 152   // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 153   Label dummy_slow_path;
 154   Label dummy_continuation;
 155   Label* slow_path = &dummy_slow_path;
 156   Label* continuation = &dummy_continuation;
 157   if (!Compile::current()->output()->in_scratch_emit_size()) {
 158     // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 159     C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 160     Compile::current()->output()->add_stub(stub);
 161     slow_path = &stub->entry();
 162     continuation = &stub->continuation();
 163   }
 164   bs->nmethod_entry_barrier(this, slow_path, continuation);
 165 }
 166 
 167 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 168   switch (vlen_in_bytes) {
 169     case  4: // fall-through
 170     case  8: // fall-through
 171     case 16: return Assembler::AVX_128bit;
 172     case 32: return Assembler::AVX_256bit;
 173     case 64: return Assembler::AVX_512bit;
 174 
 175     default: {
 176       ShouldNotReachHere();
 177       return Assembler::AVX_NoVec;
 178     }
 179   }
 180 }
 181 
 182 // fast_lock and fast_unlock used by C2
 183 
 184 // Because the transitions from emitted code to the runtime
 185 // monitorenter/exit helper stubs are so slow it's critical that
 186 // we inline both the stack-locking fast path and the inflated fast path.
 187 //
 188 // See also: cmpFastLock and cmpFastUnlock.
 189 //
 190 // What follows is a specialized inline transliteration of the code
 191 // in enter() and exit(). If we're concerned about I$ bloat another
 192 // option would be to emit TrySlowEnter and TrySlowExit methods
 193 // at startup-time.  These methods would accept arguments as
 194 // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 195 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 196 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 197 // In practice, however, the # of lock sites is bounded and is usually small.
 198 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
 199 // if the processor uses simple bimodal branch predictors keyed by EIP,
 200 // since the helper routines would be called from multiple synchronization
 201 // sites.
 202 //
 203 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 204 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 205 // to those specialized methods.  That'd give us a mostly platform-independent
 206 // implementation that the JITs could optimize and inline at their pleasure.
 207 // Done correctly, the only time we'd need to cross to native code would be
 208 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 209 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 210 // (b) issue explicit barriers or fence operations.
 211 //
 212 // TODO:
 213 //
 214 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 215 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 216 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 217 //    the lock operators would typically be faster than reifying Self.
 218 //
 219 // *  Ideally I'd define the primitives as:
 220 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 221 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 222 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 223 //    Instead, we're stuck with the rather awkward and brittle register assignments below.
 224 //    Furthermore the register assignments are overconstrained, possibly resulting in
 225 //    sub-optimal code near the synchronization site.
 226 //
 227 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 228 //    Alternately, use a better sp-proximity test.
 229 //
 230 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 231 //    Either one is sufficient to uniquely identify a thread.
 232 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 233 //
 234 // *  Intrinsify notify() and notifyAll() for the common cases where the
 235 //    object is locked by the calling thread but the waitlist is empty.
 236 //    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 237 //
 238 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 239 //    But beware of excessive branch density on AMD Opterons.
 240 //
 241 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 242 //    or failure of the fast path.  If the fast path fails then we pass
 243 //    control to the slow path, typically in C.  In fast_lock and
 244 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 245 //    will emit a conditional branch immediately after the node.
 246 //    So we have branches to branches and lots of ICC.ZF games.
 247 //    Instead, it might be better to have C2 pass a "FailureLabel"
 248 //    into fast_lock and fast_unlock.  In the case of success, control
 249 //    will drop through the node.  ICC.ZF is undefined at exit.
 250 //    In the case of failure, the node will branch directly to the
 251 //    FailureLabel
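//
// As a rough illustration of that ZF protocol (register choices and the label are
// placeholders, not taken from the .ad file), the code C2 emits around a cmpFastLock
// node effectively reduces to:
//
//    fast_lock(obj, box, rax, scr, ...);  // leaves ZF == 1 on success, ZF == 0 on failure
//    jne   slow_path_stub;                // ZF == 0 -> call the runtime monitorenter helper
//    ...                                  // ZF == 1 -> lock held, fall through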
 252 
 253 
 254 // obj: object to lock
 255 // box: on-stack box address (displaced header location) - KILLED
 256 // rax,: tmp -- KILLED
 257 // scr: tmp -- KILLED
 258 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 259                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 260                                  Metadata* method_data) {
 261   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 262   // Ensure the register assignments are disjoint
 263   assert(tmpReg == rax, "");
 264   assert(cx1Reg == noreg, "");
 265   assert(cx2Reg == noreg, "");
 266   assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 267 
 268   // Possible cases that we'll encounter in fast_lock
 269   // ------------------------------------------------
 270   // * Inflated
 271   //    -- unlocked
 272   //    -- Locked
 273   //       = by self
 274   //       = by other
 275   // * neutral
 276   // * stack-locked
 277   //    -- by self
 278   //       = sp-proximity test hits
 279   //       = sp-proximity test generates false-negative
 280   //    -- by other
 281   //
 282 
 283   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 284 
 285   if (DiagnoseSyncOnValueBasedClasses != 0) {
 286     load_klass(tmpReg, objReg, scrReg);
 287     testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 288     jcc(Assembler::notZero, DONE_LABEL);
 289   }
 290 
 291   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 292   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 293   jcc(Assembler::notZero, IsInflated);
 294 
 295   if (LockingMode == LM_MONITOR) {
 296     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be non-null.
 297     testptr(objReg, objReg);
 298   } else {
 299     assert(LockingMode == LM_LEGACY, "must be");
 300     // Attempt stack-locking ...
 301     orptr (tmpReg, markWord::unlocked_value);
 302     if (EnableValhalla) {
 303       // Mask inline_type bit such that we go to the slow path if object is an inline type
 304       andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place));
 305     }
 306     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 307     lock();
 308     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 309     jcc(Assembler::equal, COUNT);           // Success
 310 
 311     // Recursive locking.
 312     // The object is stack-locked: markword contains stack pointer to BasicLock.
 313     // Locked by current thread if difference with current SP is less than one page.
 314     subptr(tmpReg, rsp);
 315     // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
 316     andptr(tmpReg, (int32_t) (7 - (int)os::vm_page_size()) );
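    // Worked example of the mask above, assuming a 4K page: 7 - 4096 == -4089 == 0xfffff007,
    // so the AND leaves zero (ZF == 1) only when (mark - rsp) is non-negative, below one page
    // and 8-byte aligned.  E.g. a BasicLock in our own frame at rsp + 0x48 gives
    // 0x48 & 0xfffff007 == 0, while another thread's stack address differs by far more than
    // a page and leaves high bits set (ZF == 0).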
 317     movptr(Address(boxReg, 0), tmpReg);
 318   }
 319   jmp(DONE_LABEL);
 320 
 321   bind(IsInflated);
 322   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 323 
 324   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 325   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 326   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 327 
 328   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 329   movptr(boxReg, Address(r15_thread, JavaThread::monitor_owner_id_offset()));
 330   movq(scrReg, tmpReg);
 331   xorq(tmpReg, tmpReg);
 332   lock();
 333   cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 334 
 335   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 336   jccb(Assembler::equal, COUNT);    // CAS above succeeded; propagate ZF = 1 (success)
 337 
 338   cmpptr(boxReg, rax);                // Check if we are already the owner (recursive lock)
 339   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 340   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 341   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 342   bind(DONE_LABEL);
 343 
 344   // ZFlag == 1 count in fast path
 345   // ZFlag == 0 count in slow path
 346   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 347 
 348   bind(COUNT);
 349   if (LockingMode == LM_LEGACY) {
 350     // Count monitors in fast path
 351     increment(Address(thread, JavaThread::held_monitor_count_offset()));
 352   }
 353   xorl(tmpReg, tmpReg); // Set ZF == 1
 354 
 355   bind(NO_COUNT);
 356 
 357   // At NO_COUNT the icc ZFlag is set as follows ...
 358   // fast_unlock uses the same protocol.
 359   // ZFlag == 1 -> Success
 360   // ZFlag == 0 -> Failure - force control through the slow path
 361 }
 362 
 363 // obj: object to unlock
 364 // box: box address (displaced header location), killed.  Must be EAX.
 365 // tmp: killed, cannot be obj nor box.
 366 //
 367 // Some commentary on balanced locking:
 368 //
 369 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 370 // Methods that don't have provably balanced locking are forced to run in the
 371 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 372 // The interpreter provides two properties:
 373 // I1:  At return-time the interpreter automatically and quietly unlocks any
 374 //      objects acquired by the current activation (frame).  Recall that the
 375 //      interpreter maintains an on-stack list of locks currently held by
 376 //      a frame.
 377 // I2:  If a method attempts to unlock an object that is not held by the
 378 //      frame, the interpreter throws IMSX.
 379 //
 380 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
 381 // B() doesn't have provably balanced locking so it runs in the interpreter.
 382 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 383 // is still locked by A().
 384 //
 385 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 386 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 387 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 388 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
 389 // Arguably, given that the spec legislates the JNI case as undefined, our implementation
 390 // could reasonably *avoid* checking owner in fast_unlock().
 391 // In the interest of performance we elide the m->_owner == Self check in unlock.
 392 // A perfectly viable alternative is to elide the owner check except when
 393 // Xcheck:jni is enabled.
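//
// A hypothetical Java-level illustration of the argument above (the method names are made up):
//
//    void A() { synchronized (o) { B(); } }  // provably balanced: compiled, uses fast_lock/fast_unlock
//    void B() { /* locking that is not provably balanced: forced to run in the interpreter */ }
//
// If B() leaves something locked, I1 has the interpreter unlock it when B's frame returns,
// and I2 keeps B() from silently unlocking the o that A() still holds.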
 394 
 395 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
 396   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 397   assert(boxReg == rax, "");
 398   assert_different_registers(objReg, boxReg, tmpReg);
 399 
 400   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 401 
 402   if (LockingMode == LM_LEGACY) {
 403     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 404     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 405   }
 406   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 407   if (LockingMode != LM_MONITOR) {
 408     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 409     jcc(Assembler::zero, Stacked);
 410   }
 411 
 412   // It's inflated.
 413 
 414   // Despite our balanced locking property we still check that m->_owner == Self
 415   // as java routines or native JNI code called by this thread might
 416   // have released the lock.
 417   //
 418   // If there's no contention try a 1-0 exit.  That is, exit without
 419   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 420   // we detect and recover from the race that the 1-0 exit admits.
 421   //
 422   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 423   // before it STs null into _owner, releasing the lock.  Updates
 424   // to data protected by the critical section must be visible before
 425   // we drop the lock (and thus before any other thread could acquire
 426   // the lock and observe the fields protected by the lock).
 427   // IA32's memory-model is TSO, so STs are ordered with respect to
 428   // each other and there's no need for an explicit barrier (fence).
 429   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
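  //
  // Rough sketch of the stranding race the StoreLoad below guards against (pseudo code
  // only; the real enter/exit protocol lives in ObjectMonitor/synchronizer.cpp):
  //
  //   exiting thread                       entering thread
  //   ST _owner = null                     enqueue self on _entry_list
  //   LD _entry_list  (sees "empty")       LD _owner (still sees us) -> park()
  //
  // Without a fence the exiting thread's LD can effectively pass its ST, nobody wakes the
  // parked thread, and the monitor is stranded.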
 430   Label LSuccess, LNotRecursive;
 431 
 432   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 433   jccb(Assembler::equal, LNotRecursive);
 434 
 435   // Recursive inflated unlock
 436   decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 437   jmpb(LSuccess);
 438 
 439   bind(LNotRecursive);
 440 
 441   // Set owner to null.
 442   // Release to satisfy the JMM
 443   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 444   // We need a full fence after clearing owner to avoid stranding.
 445   // StoreLoad achieves this.
 446   membar(StoreLoad);
 447 
 448   // Check if the entry_list is empty.
 449   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(entry_list)), NULL_WORD);
 450   jccb(Assembler::zero, LSuccess);    // If so we are done.
 451 
 452   // Check if there is a successor.
 453   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 454   jccb(Assembler::notZero, LSuccess); // If so we are done.
 455 
 456   // Save the monitor pointer in the current thread, so we can try to
 457   // reacquire the lock in SharedRuntime::monitor_exit_helper().
 458   andptr(tmpReg, ~(int32_t)markWord::monitor_value);
 459   movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);
 460 
 461   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 462   jmpb  (DONE_LABEL);
 463 
 464   bind  (LSuccess);
 465   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 466   jmpb  (DONE_LABEL);
 467 
 468   if (LockingMode == LM_LEGACY) {
 469     bind  (Stacked);
 470     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 471     lock();
 472     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 473     // Intentional fall-thru into DONE_LABEL
 474   }
 475 
 476   bind(DONE_LABEL);
 477 
 478   // ZFlag == 1 count in fast path
 479   // ZFlag == 0 count in slow path
 480   jccb(Assembler::notZero, NO_COUNT);
 481 
 482   bind(COUNT);
 483 
 484   if (LockingMode == LM_LEGACY) {
 485     // Count monitors in fast path
 486     decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 487   }
 488 
 489   xorl(tmpReg, tmpReg); // Set ZF == 1
 490 
 491   bind(NO_COUNT);
 492 }
 493 
 494 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 495                                               Register t, Register thread) {
 496   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 497   assert(rax_reg == rax, "Used for CAS");
 498   assert_different_registers(obj, box, rax_reg, t, thread);
 499 
 500   // Handle inflated monitor.
 501   Label inflated;
 502   // Finish fast lock successfully. ZF value is irrelevant.
 503   Label locked;
 504   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 505   Label slow_path;
 506 
 507   if (UseObjectMonitorTable) {
 508     // Clear cache in case fast locking succeeds or we need to take the slow-path.
 509     movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
 510   }
 511 
 512   if (DiagnoseSyncOnValueBasedClasses != 0) {
 513     load_klass(rax_reg, obj, t);
 514     testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 515     jcc(Assembler::notZero, slow_path);
 516   }
 517 
 518   const Register mark = t;
 519 
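  // A rough picture of the per-thread lock-stack manipulated below (derived from the
  // offsets used here, not a definitive layout): JavaThread keeps a small array of oops
  // plus a byte offset 'top' of the first free slot; pushing a lock stores obj at
  // thread[top] and bumps top by oopSize, and a recursive acquire is recognized by
  // comparing obj against the entry at top - oopSize.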
 520   { // Lightweight Lock
 521 
 522     Label push;
 523 
 524     const Register top = UseObjectMonitorTable ? rax_reg : box;
 525 
 526     // Load the mark.
 527     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 528 
 529     // Prefetch top.
 530     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 531 
 532     // Check for monitor (0b10).
 533     testptr(mark, markWord::monitor_value);
 534     jcc(Assembler::notZero, inflated);
 535 
 536     // Check if lock-stack is full.
 537     cmpl(top, LockStack::end_offset() - 1);
 538     jcc(Assembler::greater, slow_path);
 539 
 540     // Check if recursive.
 541     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 542     jccb(Assembler::equal, push);
 543 
 544     // Try to lock. Transition lock bits 0b01 => 0b00
 545     movptr(rax_reg, mark);
 546     orptr(rax_reg, markWord::unlocked_value);
 547     andptr(mark, ~(int32_t)markWord::unlocked_value);
 548     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 549     jcc(Assembler::notEqual, slow_path);
 550 
 551     if (UseObjectMonitorTable) {
 552       // Need to reload top, clobbered by CAS.
 553       movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 554     }
 555     bind(push);
 556     // After successful lock, push object on lock-stack.
 557     movptr(Address(thread, top), obj);
 558     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 559     jmpb(locked);
 560   }
 561 
 562   { // Handle inflated monitor.
 563     bind(inflated);
 564 
 565     const Register monitor = t;
 566 
 567     if (!UseObjectMonitorTable) {
 568       assert(mark == monitor, "should be the same here");
 569     } else {
 570       // Uses ObjectMonitorTable.  Look for the monitor in the om_cache.
 571       // Fetch ObjectMonitor* from the cache or take the slow-path.
 572       Label monitor_found;
 573 
 574       // Load cache address
 575       lea(t, Address(thread, JavaThread::om_cache_oops_offset()));
 576 
 577       const int num_unrolled = 2;
 578       for (int i = 0; i < num_unrolled; i++) {
 579         cmpptr(obj, Address(t));
 580         jccb(Assembler::equal, monitor_found);
 581         increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 582       }
 583 
 584       Label loop;
 585 
 586       // Search for obj in cache.
 587       bind(loop);
 588 
 589       // Check for match.
 590       cmpptr(obj, Address(t));
 591       jccb(Assembler::equal, monitor_found);
 592 
 593       // Search until null encountered, guaranteed _null_sentinel at end.
 594       cmpptr(Address(t), 1);
 595       jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
 596       increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 597       jmpb(loop);
 598 
 599       // Cache hit.
 600       bind(monitor_found);
 601       movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
 602     }
 603     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 604     const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 605     const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);
 606 
 607     Label monitor_locked;
 608     // Lock the monitor.
 609 
 610     if (UseObjectMonitorTable) {
 611       // Cache the monitor for unlock before trashing box. On failure to acquire
 612       // the lock, the slow path will reset the entry accordingly (see CacheSetter).
 613       movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
 614     }
 615 
 616     // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 617     xorptr(rax_reg, rax_reg);
 618     movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
 619     lock(); cmpxchgptr(box, owner_address);
 620     jccb(Assembler::equal, monitor_locked);
 621 
 622     // Check if recursive.
 623     cmpptr(box, rax_reg);
 624     jccb(Assembler::notEqual, slow_path);
 625 
 626     // Recursive.
 627     increment(recursions_address);
 628 
 629     bind(monitor_locked);
 630   }
 631 
 632   bind(locked);
 633   // Set ZF = 1
 634   xorl(rax_reg, rax_reg);
 635 
 636 #ifdef ASSERT
 637   // Check that locked label is reached with ZF set.
 638   Label zf_correct;
 639   Label zf_bad_zero;
 640   jcc(Assembler::zero, zf_correct);
 641   jmp(zf_bad_zero);
 642 #endif
 643 
 644   bind(slow_path);
 645 #ifdef ASSERT
 646   // Check that slow_path label is reached with ZF not set.
 647   jcc(Assembler::notZero, zf_correct);
 648   stop("Fast Lock ZF != 0");
 649   bind(zf_bad_zero);
 650   stop("Fast Lock ZF != 1");
 651   bind(zf_correct);
 652 #endif
 653   // C2 uses the value of ZF to determine the continuation.
 654 }
 655 
 656 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
 657   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 658   assert(reg_rax == rax, "Used for CAS");
 659   assert_different_registers(obj, reg_rax, t);
 660 
 661   // Handle inflated monitor.
 662   Label inflated, inflated_check_lock_stack;
 663   // Finish fast unlock successfully.  MUST jump with ZF == 1
 664   Label unlocked, slow_path;
 665 
 666   const Register mark = t;
 667   const Register monitor = t;
 668   const Register top = UseObjectMonitorTable ? t : reg_rax;
 669   const Register box = reg_rax;
 670 
 671   Label dummy;
 672   C2FastUnlockLightweightStub* stub = nullptr;
 673 
 674   if (!Compile::current()->output()->in_scratch_emit_size()) {
 675     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
 676     Compile::current()->output()->add_stub(stub);
 677   }
 678 
 679   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
 680 
 681   { // Lightweight Unlock
 682 
 683     // Load top.
 684     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 685 
 686     if (!UseObjectMonitorTable) {
 687       // Prefetch mark.
 688       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 689     }
 690 
 691     // Check if obj is top of lock-stack.
 692     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 693     // Top of lock stack was not obj. Must be monitor.
 694     jcc(Assembler::notEqual, inflated_check_lock_stack);
 695 
 696     // Pop lock-stack.
 697     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
 698     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 699 
 700     // Check if recursive.
 701     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
 702     jcc(Assembler::equal, unlocked);
 703 
 704     // We elide the monitor check, let the CAS fail instead.
 705 
 706     if (UseObjectMonitorTable) {
 707       // Load mark.
 708       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 709     }
 710 
 711     // Try to unlock. Transition lock bits 0b00 => 0b01
 712     movptr(reg_rax, mark);
 713     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
 714     orptr(mark, markWord::unlocked_value);
 715     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 716     jcc(Assembler::notEqual, push_and_slow_path);
 717     jmp(unlocked);
 718   }
 719 
 720 
 721   { // Handle inflated monitor.
 722     bind(inflated_check_lock_stack);
 723 #ifdef ASSERT
 724     Label check_done;
 725     subl(top, oopSize);
 726     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
 727     jcc(Assembler::below, check_done);
 728     cmpptr(obj, Address(thread, top));
 729     jccb(Assembler::notEqual, inflated_check_lock_stack);
 730     stop("Fast Unlock lock on stack");
 731     bind(check_done);
 732     if (UseObjectMonitorTable) {
 733       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 734     }
 735     testptr(mark, markWord::monitor_value);
 736     jccb(Assembler::notZero, inflated);
 737     stop("Fast Unlock not monitor");
 738 #endif
 739 
 740     bind(inflated);
 741 
 742     if (!UseObjectMonitorTable) {
 743       assert(mark == monitor, "should be the same here");
 744     } else {
 745       // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
 746       movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 747       // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
 748       cmpptr(monitor, alignof(ObjectMonitor*));
 749       jcc(Assembler::below, slow_path);
 750     }
 751     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 752     const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
 753     const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
 754     const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
 755     const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
 756 
 757     Label recursive;
 758 
 759     // Check if recursive.
 760     cmpptr(recursions_address, 0);
 761     jccb(Assembler::notZero, recursive);
 762 
 763     // Set owner to null.
 764     // Release to satisfy the JMM
 765     movptr(owner_address, NULL_WORD);
 766     // We need a full fence after clearing owner to avoid stranding.
 767     // StoreLoad achieves this.
 768     membar(StoreLoad);
 769 
 770     // Check if the entry_list is empty.
 771     cmpptr(entry_list_address, NULL_WORD);
 772     jccb(Assembler::zero, unlocked);    // If so we are done.
 773 
 774     // Check if there is a successor.
 775     cmpptr(succ_address, NULL_WORD);
 776     jccb(Assembler::notZero, unlocked); // If so we are done.
 777 
 778     // Save the monitor pointer in the current thread, so we can try to
 779     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 780     if (!UseObjectMonitorTable) {
 781       andptr(monitor, ~(int32_t)markWord::monitor_value);
 782     }
 783     movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);
 784 
 785     orl(t, 1); // Fast Unlock ZF = 0
 786     jmpb(slow_path);
 787 
 788     // Recursive unlock.
 789     bind(recursive);
 790     decrement(recursions_address);
 791   }
 792 
 793   bind(unlocked);
 794   xorl(t, t); // Fast Unlock ZF = 1
 795 
 796 #ifdef ASSERT
 797   // Check that unlocked label is reached with ZF set.
 798   Label zf_correct;
 799   Label zf_bad_zero;
 800   jcc(Assembler::zero, zf_correct);
 801   jmp(zf_bad_zero);
 802 #endif
 803 
 804   bind(slow_path);
 805   if (stub != nullptr) {
 806     bind(stub->slow_path_continuation());
 807   }
 808 #ifdef ASSERT
 809   // Check that stub->continuation() label is reached with ZF not set.
 810   jcc(Assembler::notZero, zf_correct);
 811   stop("Fast Unlock ZF != 0");
 812   bind(zf_bad_zero);
 813   stop("Fast Unlock ZF != 1");
 814   bind(zf_correct);
 815 #endif
 816   // C2 uses the value of ZF to determine the continuation.
 817 }
 818 
 819 //-------------------------------------------------------------------------------------------
 820 // Generic instructions support for use in .ad files C2 code generation
 821 
 822 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 823   if (dst != src) {
 824     movdqu(dst, src);
 825   }
 826   if (opcode == Op_AbsVD) {
 827     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 828   } else {
 829     assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 830     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 831   }
 832 }
 833 
 834 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 835   if (opcode == Op_AbsVD) {
 836     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 837   } else {
 838     assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 839     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 840   }
 841 }
 842 
 843 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 844   if (dst != src) {
 845     movdqu(dst, src);
 846   }
 847   if (opcode == Op_AbsVF) {
 848     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 849   } else {
 850     assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
 851     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 852   }
 853 }
 854 
 855 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 856   if (opcode == Op_AbsVF) {
 857     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 858   } else {
 859     assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
 860     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 861   }
 862 }
 863 
 864 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 865   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 866   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 867 
 868   if (opcode == Op_MinV) {
 869     if (elem_bt == T_BYTE) {
 870       pminsb(dst, src);
 871     } else if (elem_bt == T_SHORT) {
 872       pminsw(dst, src);
 873     } else if (elem_bt == T_INT) {
 874       pminsd(dst, src);
 875     } else {
 876       assert(elem_bt == T_LONG, "required");
 877       assert(tmp == xmm0, "required");
 878       assert_different_registers(dst, src, tmp);
 879       movdqu(xmm0, dst);
 880       pcmpgtq(xmm0, src);
 881       blendvpd(dst, src);  // xmm0 as mask
 882     }
 883   } else { // opcode == Op_MaxV
 884     if (elem_bt == T_BYTE) {
 885       pmaxsb(dst, src);
 886     } else if (elem_bt == T_SHORT) {
 887       pmaxsw(dst, src);
 888     } else if (elem_bt == T_INT) {
 889       pmaxsd(dst, src);
 890     } else {
 891       assert(elem_bt == T_LONG, "required");
 892       assert(tmp == xmm0, "required");
 893       assert_different_registers(dst, src, tmp);
 894       movdqu(xmm0, src);
 895       pcmpgtq(xmm0, dst);
 896       blendvpd(dst, src);  // xmm0 as mask
 897     }
 898   }
 899 }
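// A quick sanity trace of the T_LONG compare+blend idiom above (blendvpd selects per lane
// on the sign bit of the implicit xmm0 mask):
//   MinV: xmm0 = (dst > src); blendvpd copies src into dst exactly where that lane's mask
//         is set, leaving dst = min(dst, src).
//   MaxV: xmm0 = (src > dst); the same blend leaves dst = max(dst, src).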
 900 
 901 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 902                                   XMMRegister src1, Address src2, int vlen_enc) {
 903   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 904   if (opcode == Op_UMinV) {
 905     switch(elem_bt) {
 906       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 907       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 908       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 909       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 910       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 911     }
 912   } else {
 913     assert(opcode == Op_UMaxV, "required");
 914     switch(elem_bt) {
 915       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 916       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 917       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 918       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 919       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 920     }
 921   }
 922 }
 923 
 924 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
 925   // For optimality, leverage a full vector width of 512 bits
 926   // for operations over smaller vector sizes on AVX512 targets.
 927   if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 928     if (opcode == Op_UMaxV) {
 929       evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 930     } else {
 931       assert(opcode == Op_UMinV, "required");
 932       evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 933     }
 934   } else {
 935     // T1 = -1
 936     vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
 937     // T1 = -1 << 63
 938     vpsllq(xtmp1, xtmp1, 63, vlen_enc);
 939     // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
 940     vpaddq(xtmp2, xtmp1, src2, vlen_enc);
 941     // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
 942     vpaddq(xtmp1, xtmp1, src1, vlen_enc);
 943     // Mask = T2 > T1
 944     vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
 945     if (opcode == Op_UMaxV) {
 946       // Res = Mask ? Src2 : Src1
 947       vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
 948     } else {
 949       // Res = Mask ? Src1 : Src2
 950       vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
 951     }
 952   }
 953 }
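// A worked example of the sign-bias trick above: for src1 = 0xFFFFFFFFFFFFFFFF (unsigned max)
// and src2 = 1 the code forms T1 = src1 + 0x8000000000000000 = 0x7FFFFFFFFFFFFFFF and
// T2 = src2 + 0x8000000000000000 = 0x8000000000000001, so the signed compare "T2 > T1" is
// false and the blend mask stays clear: UMaxV keeps src1 (the larger unsigned value) and
// UMinV keeps src2, as required.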
 954 
 955 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 956                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
 957   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 958   if (opcode == Op_UMinV) {
 959     switch(elem_bt) {
 960       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 961       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 962       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 963       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 964       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 965     }
 966   } else {
 967     assert(opcode == Op_UMaxV, "required");
 968     switch(elem_bt) {
 969       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 970       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 971       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 972       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 973       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 974     }
 975   }
 976 }
 977 
 978 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 979                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 980                                  int vlen_enc) {
 981   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 982 
 983   if (opcode == Op_MinV) {
 984     if (elem_bt == T_BYTE) {
 985       vpminsb(dst, src1, src2, vlen_enc);
 986     } else if (elem_bt == T_SHORT) {
 987       vpminsw(dst, src1, src2, vlen_enc);
 988     } else if (elem_bt == T_INT) {
 989       vpminsd(dst, src1, src2, vlen_enc);
 990     } else {
 991       assert(elem_bt == T_LONG, "required");
 992       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 993         vpminsq(dst, src1, src2, vlen_enc);
 994       } else {
 995         assert_different_registers(dst, src1, src2);
 996         vpcmpgtq(dst, src1, src2, vlen_enc);
 997         vblendvpd(dst, src1, src2, dst, vlen_enc);
 998       }
 999     }
1000   } else { // opcode == Op_MaxV
1001     if (elem_bt == T_BYTE) {
1002       vpmaxsb(dst, src1, src2, vlen_enc);
1003     } else if (elem_bt == T_SHORT) {
1004       vpmaxsw(dst, src1, src2, vlen_enc);
1005     } else if (elem_bt == T_INT) {
1006       vpmaxsd(dst, src1, src2, vlen_enc);
1007     } else {
1008       assert(elem_bt == T_LONG, "required");
1009       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1010         vpmaxsq(dst, src1, src2, vlen_enc);
1011       } else {
1012         assert_different_registers(dst, src1, src2);
1013         vpcmpgtq(dst, src1, src2, vlen_enc);
1014         vblendvpd(dst, src2, src1, dst, vlen_enc);
1015       }
1016     }
1017   }
1018 }
1019 
1020 // Float/Double min max
1021 
1022 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1023                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1024                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1025                                    int vlen_enc) {
1026   assert(UseAVX > 0, "required");
1027   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1028          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1029   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1030   assert_different_registers(a, tmp, atmp, btmp);
1031   assert_different_registers(b, tmp, atmp, btmp);
1032 
1033   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1034   bool is_double_word = is_double_word_type(elem_bt);
1035 
1036   /* Note on 'non-obvious' assembly sequence:
1037    *
1038    * While there are vminps/vmaxps instructions, there are two important differences between hardware
1039    * and Java on how they handle floats:
1040    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
1041    *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
1042    *
1043    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
1044    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
1045    *                (only useful when signs differ, noop otherwise)
1046    *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
1047    *
1048    *  The following pseudo code describes the algorithm for max[FD] (the min algorithm is along similar lines):
1049    *   btmp = (b < +0.0) ? a : b
1050    *   atmp = (b < +0.0) ? b : a
1051    *   Tmp  = Max_Float(atmp , btmp)
1052    *   Res  = (atmp == NaN) ? atmp : Tmp
1053    */
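// Two concrete traces of the max[FD] pseudo code above (the "(b < +0.0)" test is realized
// below via the blend instructions' sign-bit semantics, i.e. "sign bit of b set"):
//   max(-0.0, +0.0): sign(b) clear -> atmp = -0.0, btmp = +0.0; vmaxps returns its second
//     operand on equality, so the result is +0.0, matching Math.max.
//   max(NaN, 1.0):   atmp = NaN, btmp = 1.0; vmaxps returns 1.0, but the final UNORD_Q
//     blend sees atmp unordered and forces the result to NaN, again matching Math.max.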
1054 
1055   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
1056   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
1057   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
1058   XMMRegister mask;
1059 
1060   if (!is_double_word && is_min) {
1061     mask = a;
1062     vblend = &MacroAssembler::vblendvps;
1063     vmaxmin = &MacroAssembler::vminps;
1064     vcmp = &MacroAssembler::vcmpps;
1065   } else if (!is_double_word && !is_min) {
1066     mask = b;
1067     vblend = &MacroAssembler::vblendvps;
1068     vmaxmin = &MacroAssembler::vmaxps;
1069     vcmp = &MacroAssembler::vcmpps;
1070   } else if (is_double_word && is_min) {
1071     mask = a;
1072     vblend = &MacroAssembler::vblendvpd;
1073     vmaxmin = &MacroAssembler::vminpd;
1074     vcmp = &MacroAssembler::vcmppd;
1075   } else {
1076     assert(is_double_word && !is_min, "sanity");
1077     mask = b;
1078     vblend = &MacroAssembler::vblendvpd;
1079     vmaxmin = &MacroAssembler::vmaxpd;
1080     vcmp = &MacroAssembler::vcmppd;
1081   }
1082 
1083   // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
1084   XMMRegister maxmin, scratch;
1085   if (dst == btmp) {
1086     maxmin = btmp;
1087     scratch = tmp;
1088   } else {
1089     maxmin = tmp;
1090     scratch = btmp;
1091   }
1092 
1093   bool precompute_mask = EnableX86ECoreOpts && UseAVX > 1;
1094   if (precompute_mask && !is_double_word) {
1095     vpsrad(tmp, mask, 32, vlen_enc);
1096     mask = tmp;
1097   } else if (precompute_mask && is_double_word) {
1098     vpxor(tmp, tmp, tmp, vlen_enc);
1099     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1100     mask = tmp;
1101   }
1102 
1103   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1104   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1105   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1106   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1107   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1108 }
1109 
1110 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1111                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1112                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1113                                     int vlen_enc) {
1114   assert(UseAVX > 2, "required");
1115   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1116          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1117   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1118   assert_different_registers(dst, a, atmp, btmp);
1119   assert_different_registers(dst, b, atmp, btmp);
1120 
1121   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1122   bool is_double_word = is_double_word_type(elem_bt);
1123   bool merge = true;
1124 
1125   if (!is_double_word && is_min) {
1126     evpmovd2m(ktmp, a, vlen_enc);
1127     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1128     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1129     vminps(dst, atmp, btmp, vlen_enc);
1130     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1131     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1132   } else if (!is_double_word && !is_min) {
1133     evpmovd2m(ktmp, b, vlen_enc);
1134     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1135     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1136     vmaxps(dst, atmp, btmp, vlen_enc);
1137     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1138     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1139   } else if (is_double_word && is_min) {
1140     evpmovq2m(ktmp, a, vlen_enc);
1141     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1142     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1143     vminpd(dst, atmp, btmp, vlen_enc);
1144     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1145     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1146   } else {
1147     assert(is_double_word && !is_min, "sanity");
1148     evpmovq2m(ktmp, b, vlen_enc);
1149     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1150     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1151     vmaxpd(dst, atmp, btmp, vlen_enc);
1152     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1153     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1154   }
1155 }
1156 
1157 // Float/Double signum
1158 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1159   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1160 
1161   Label DONE_LABEL;
1162 
1163   if (opcode == Op_SignumF) {
1164     ucomiss(dst, zero);
1165     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1166     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1167     movflt(dst, one);
1168     jcc(Assembler::above, DONE_LABEL);
1169     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1170   } else if (opcode == Op_SignumD) {
1171     ucomisd(dst, zero);
1172     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1173     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1174     movdbl(dst, one);
1175     jcc(Assembler::above, DONE_LABEL);
1176     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1177   }
1178 
1179   bind(DONE_LABEL);
1180 }
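// A short trace of signum_fp above for Op_SignumF with dst = -3.5f: ucomiss leaves
// ZF = 0, PF = 0, CF = 1, so neither early exit is taken; dst is loaded with 1.0f, the
// 'above' branch falls through (CF == 1), and the xorps with vector_float_sign_flip turns
// it into -1.0f.  For +/-0.0f and NaN the early exits return the argument unchanged,
// which is exactly Math.signum's contract.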
1181 
1182 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1183   if (sign) {
1184     pmovsxbw(dst, src);
1185   } else {
1186     pmovzxbw(dst, src);
1187   }
1188 }
1189 
1190 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1191   if (sign) {
1192     vpmovsxbw(dst, src, vector_len);
1193   } else {
1194     vpmovzxbw(dst, src, vector_len);
1195   }
1196 }
1197 
1198 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1199   if (sign) {
1200     vpmovsxbd(dst, src, vector_len);
1201   } else {
1202     vpmovzxbd(dst, src, vector_len);
1203   }
1204 }
1205 
1206 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1207   if (sign) {
1208     vpmovsxwd(dst, src, vector_len);
1209   } else {
1210     vpmovzxwd(dst, src, vector_len);
1211   }
1212 }
1213 
1214 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1215                                      int shift, int vector_len) {
1216   if (opcode == Op_RotateLeftV) {
1217     if (etype == T_INT) {
1218       evprold(dst, src, shift, vector_len);
1219     } else {
1220       assert(etype == T_LONG, "expected type T_LONG");
1221       evprolq(dst, src, shift, vector_len);
1222     }
1223   } else {
1224     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1225     if (etype == T_INT) {
1226       evprord(dst, src, shift, vector_len);
1227     } else {
1228       assert(etype == T_LONG, "expected type T_LONG");
1229       evprorq(dst, src, shift, vector_len);
1230     }
1231   }
1232 }
1233 
1234 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1235                                      XMMRegister shift, int vector_len) {
1236   if (opcode == Op_RotateLeftV) {
1237     if (etype == T_INT) {
1238       evprolvd(dst, src, shift, vector_len);
1239     } else {
1240       assert(etype == T_LONG, "expected type T_LONG");
1241       evprolvq(dst, src, shift, vector_len);
1242     }
1243   } else {
1244     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1245     if (etype == T_INT) {
1246       evprorvd(dst, src, shift, vector_len);
1247     } else {
1248       assert(etype == T_LONG, "expected type T_LONG");
1249       evprorvq(dst, src, shift, vector_len);
1250     }
1251   }
1252 }
1253 
1254 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1255   if (opcode == Op_RShiftVI) {
1256     psrad(dst, shift);
1257   } else if (opcode == Op_LShiftVI) {
1258     pslld(dst, shift);
1259   } else {
1260     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1261     psrld(dst, shift);
1262   }
1263 }
1264 
1265 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1266   switch (opcode) {
1267     case Op_RShiftVI:  psrad(dst, shift); break;
1268     case Op_LShiftVI:  pslld(dst, shift); break;
1269     case Op_URShiftVI: psrld(dst, shift); break;
1270 
1271     default: assert(false, "%s", NodeClassNames[opcode]);
1272   }
1273 }
1274 
1275 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1276   if (opcode == Op_RShiftVI) {
1277     vpsrad(dst, nds, shift, vector_len);
1278   } else if (opcode == Op_LShiftVI) {
1279     vpslld(dst, nds, shift, vector_len);
1280   } else {
1281     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1282     vpsrld(dst, nds, shift, vector_len);
1283   }
1284 }
1285 
1286 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1287   switch (opcode) {
1288     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1289     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1290     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1291 
1292     default: assert(false, "%s", NodeClassNames[opcode]);
1293   }
1294 }
1295 
1296 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1297   switch (opcode) {
1298     case Op_RShiftVB:  // fall-through
1299     case Op_RShiftVS:  psraw(dst, shift); break;
1300 
1301     case Op_LShiftVB:  // fall-through
1302     case Op_LShiftVS:  psllw(dst, shift);   break;
1303 
1304     case Op_URShiftVS: // fall-through
1305     case Op_URShiftVB: psrlw(dst, shift);  break;
1306 
1307     default: assert(false, "%s", NodeClassNames[opcode]);
1308   }
1309 }
1310 
1311 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1312   switch (opcode) {
1313     case Op_RShiftVB:  // fall-through
1314     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1315 
1316     case Op_LShiftVB:  // fall-through
1317     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1318 
1319     case Op_URShiftVS: // fall-through
1320     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1321 
1322     default: assert(false, "%s", NodeClassNames[opcode]);
1323   }
1324 }
1325 
1326 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1327   switch (opcode) {
1328     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1329     case Op_LShiftVL:  psllq(dst, shift); break;
1330     case Op_URShiftVL: psrlq(dst, shift); break;
1331 
1332     default: assert(false, "%s", NodeClassNames[opcode]);
1333   }
1334 }
1335 
1336 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1337   if (opcode == Op_RShiftVL) {
1338     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1339   } else if (opcode == Op_LShiftVL) {
1340     psllq(dst, shift);
1341   } else {
1342     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1343     psrlq(dst, shift);
1344   }
1345 }
1346 
1347 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1348   switch (opcode) {
1349     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1350     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1351     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1352 
1353     default: assert(false, "%s", NodeClassNames[opcode]);
1354   }
1355 }
1356 
1357 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1358   if (opcode == Op_RShiftVL) {
1359     evpsraq(dst, nds, shift, vector_len);
1360   } else if (opcode == Op_LShiftVL) {
1361     vpsllq(dst, nds, shift, vector_len);
1362   } else {
1363     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1364     vpsrlq(dst, nds, shift, vector_len);
1365   }
1366 }
1367 
1368 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1369   switch (opcode) {
1370     case Op_RShiftVB:  // fall-through
1371     case Op_RShiftVS:  // fall-through
1372     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1373 
1374     case Op_LShiftVB:  // fall-through
1375     case Op_LShiftVS:  // fall-through
1376     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1377 
1378     case Op_URShiftVB: // fall-through
1379     case Op_URShiftVS: // fall-through
1380     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1381 
1382     default: assert(false, "%s", NodeClassNames[opcode]);
1383   }
1384 }
1385 
1386 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1387   switch (opcode) {
1388     case Op_RShiftVB:  // fall-through
1389     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1390 
1391     case Op_LShiftVB:  // fall-through
1392     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1393 
1394     case Op_URShiftVB: // fall-through
1395     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1396 
1397     default: assert(false, "%s", NodeClassNames[opcode]);
1398   }
1399 }
1400 
1401 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1402   assert(UseAVX >= 2, "required");
1403   switch (opcode) {
1404     case Op_RShiftVL: {
1405       if (UseAVX > 2) {
1406         assert(tmp == xnoreg, "not used");
1407         if (!VM_Version::supports_avx512vl()) {
1408           vlen_enc = Assembler::AVX_512bit;
1409         }
1410         evpsravq(dst, src, shift, vlen_enc);
1411       } else {
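             // AVX2 lacks a variable arithmetic right shift for 64-bit lanes, so emulate it
             // with logical shifts: with m = the per-lane sign bit (vector_long_sign_mask)
             // and t = m >>> s, ((x >>> s) ^ t) - t sign-extends the logically shifted value.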
1412         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1413         vpsrlvq(dst, src, shift, vlen_enc);
1414         vpsrlvq(tmp, tmp, shift, vlen_enc);
1415         vpxor(dst, dst, tmp, vlen_enc);
1416         vpsubq(dst, dst, tmp, vlen_enc);
1417       }
1418       break;
1419     }
1420     case Op_LShiftVL: {
1421       assert(tmp == xnoreg, "not used");
1422       vpsllvq(dst, src, shift, vlen_enc);
1423       break;
1424     }
1425     case Op_URShiftVL: {
1426       assert(tmp == xnoreg, "not used");
1427       vpsrlvq(dst, src, shift, vlen_enc);
1428       break;
1429     }
1430     default: assert(false, "%s", NodeClassNames[opcode]);
1431   }
1432 }
1433 
1434 // Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1435 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1436   assert(opcode == Op_LShiftVB ||
1437          opcode == Op_RShiftVB ||
1438          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1439   bool sign = (opcode != Op_URShiftVB);
1440   assert(vector_len == 0, "required");
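       // Widen the byte elements and shift counts to dwords, do a variable dword shift,
       // mask the results back to byte range, then pack the dwords down to words.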
1441   vextendbd(sign, dst, src, 1);
1442   vpmovzxbd(vtmp, shift, 1);
1443   varshiftd(opcode, dst, dst, vtmp, 1);
1444   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1445   vextracti128_high(vtmp, dst);
1446   vpackusdw(dst, dst, vtmp, 0);
1447 }
1448 
1449 // Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1450 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1451   assert(opcode == Op_LShiftVB ||
1452          opcode == Op_RShiftVB ||
1453          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1454   bool sign = (opcode != Op_URShiftVB);
1455   int ext_vector_len = vector_len + 1;
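       // Widen the byte elements and shift counts to words, do a variable word shift,
       // mask the results back to byte range, then pack the words back to bytes
       // (with a cross-lane fix-up for 256/512-bit vectors).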
1456   vextendbw(sign, dst, src, ext_vector_len);
1457   vpmovzxbw(vtmp, shift, ext_vector_len);
1458   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1459   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1460   if (vector_len == 0) {
1461     vextracti128_high(vtmp, dst);
1462     vpackuswb(dst, dst, vtmp, vector_len);
1463   } else {
1464     vextracti64x4_high(vtmp, dst);
1465     vpackuswb(dst, dst, vtmp, vector_len);
1466     vpermq(dst, dst, 0xD8, vector_len);
1467   }
1468 }
1469 
1470 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1471   switch(typ) {
1472     case T_BYTE:
1473       pinsrb(dst, val, idx);
1474       break;
1475     case T_SHORT:
1476       pinsrw(dst, val, idx);
1477       break;
1478     case T_INT:
1479       pinsrd(dst, val, idx);
1480       break;
1481     case T_LONG:
1482       pinsrq(dst, val, idx);
1483       break;
1484     default:
1485       assert(false,"Should not reach here.");
1486       break;
1487   }
1488 }
1489 
1490 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1491   switch(typ) {
1492     case T_BYTE:
1493       vpinsrb(dst, src, val, idx);
1494       break;
1495     case T_SHORT:
1496       vpinsrw(dst, src, val, idx);
1497       break;
1498     case T_INT:
1499       vpinsrd(dst, src, val, idx);
1500       break;
1501     case T_LONG:
1502       vpinsrq(dst, src, val, idx);
1503       break;
1504     default:
1505       assert(false,"Should not reach here.");
1506       break;
1507   }
1508 }
1509 
1510 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
1511                                                 XMMRegister dst, Register base,
1512                                                 Register idx_base,
1513                                                 Register offset, Register mask,
1514                                                 Register mask_idx, Register rtmp,
1515                                                 int vlen_enc) {
1516   vpxor(dst, dst, dst, vlen_enc);
1517   if (elem_bt == T_SHORT) {
1518     for (int i = 0; i < 4; i++) {
1519       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1520       Label skip_load;
1521       btq(mask, mask_idx);
1522       jccb(Assembler::carryClear, skip_load);
1523       movl(rtmp, Address(idx_base, i * 4));
1524       if (offset != noreg) {
1525         addl(rtmp, offset);
1526       }
1527       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1528       bind(skip_load);
1529       incq(mask_idx);
1530     }
1531   } else {
1532     assert(elem_bt == T_BYTE, "");
1533     for (int i = 0; i < 8; i++) {
1534       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1535       Label skip_load;
1536       btq(mask, mask_idx);
1537       jccb(Assembler::carryClear, skip_load);
1538       movl(rtmp, Address(idx_base, i * 4));
1539       if (offset != noreg) {
1540         addl(rtmp, offset);
1541       }
1542       pinsrb(dst, Address(base, rtmp), i);
1543       bind(skip_load);
1544       incq(mask_idx);
1545     }
1546   }
1547 }
1548 
1549 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
1550                                          Register base, Register idx_base,
1551                                          Register offset, Register rtmp,
1552                                          int vlen_enc) {
1553   vpxor(dst, dst, dst, vlen_enc);
1554   if (elem_bt == T_SHORT) {
1555     for (int i = 0; i < 4; i++) {
1556       // dst[i] = src[offset + idx_base[i]]
1557       movl(rtmp, Address(idx_base, i * 4));
1558       if (offset != noreg) {
1559         addl(rtmp, offset);
1560       }
1561       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1562     }
1563   } else {
1564     assert(elem_bt == T_BYTE, "");
1565     for (int i = 0; i < 8; i++) {
1566       // dst[i] = src[offset + idx_base[i]]
1567       movl(rtmp, Address(idx_base, i * 4));
1568       if (offset != noreg) {
1569         addl(rtmp, offset);
1570       }
1571       pinsrb(dst, Address(base, rtmp), i);
1572     }
1573   }
1574 }
1575 
1576 /*
1577  * Gather using a hybrid algorithm: first partially unroll a scalar loop
1578  * to accumulate values from the gather indices into a quad-word (64-bit) slice.
1579  * A slice may hold 8 byte values or 4 short values. This is followed by a vector
1580  * permutation to place the slice into the appropriate vector lane
1581  * locations in the destination vector. The following pseudo code describes
1582  * the algorithm in detail:
1583  *
1584  * DST_VEC = ZERO_VEC
1585  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1586  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1587  * FOREACH_ITER:
1588  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1589  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1590  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1591  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1592  *
1593  * With each iteration, the doubleword permute indices (0, 1) corresponding
1594  * to the gathered quadword are shifted right by two lane positions.
1595  *
1596  */
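     // For example, for T_SHORT elements with vector_len = 32 (a 512-bit result),
     // each iteration gathers 4 shorts into one 64-bit slice, so the loop below
     // executes 8 times.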
1597 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1598                                         Register base, Register idx_base,
1599                                         Register offset, Register mask,
1600                                         XMMRegister xtmp1, XMMRegister xtmp2,
1601                                         XMMRegister temp_dst, Register rtmp,
1602                                         Register mask_idx, Register length,
1603                                         int vector_len, int vlen_enc) {
1604   Label GATHER8_LOOP;
1605   assert(is_subword_type(elem_ty), "");
1606   movl(length, vector_len);
1607   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1608   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1609   vallones(xtmp2, vlen_enc);
1610   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1611   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1612   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1613 
1614   bind(GATHER8_LOOP);
1615     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1616     if (mask == noreg) {
1617       vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc);
1618     } else {
1619       vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc);
1620     }
1621     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1622     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1623     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1624     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1625     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1626     vpor(dst, dst, temp_dst, vlen_enc);
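         // Advance past the int indices consumed this iteration (8 for bytes, 4 for
         // shorts, i.e. 32 or 16 bytes) and decrement the remaining element count.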
1627     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
1628     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1629     jcc(Assembler::notEqual, GATHER8_LOOP);
1630 }
1631 
1632 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1633   switch(typ) {
1634     case T_INT:
1635       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1636       break;
1637     case T_FLOAT:
1638       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1639       break;
1640     case T_LONG:
1641       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1642       break;
1643     case T_DOUBLE:
1644       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1645       break;
1646     default:
1647       assert(false,"Should not reach here.");
1648       break;
1649   }
1650 }
1651 
1652 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1653   switch(typ) {
1654     case T_INT:
1655       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1656       break;
1657     case T_FLOAT:
1658       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1659       break;
1660     case T_LONG:
1661       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1662       break;
1663     case T_DOUBLE:
1664       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1665       break;
1666     default:
1667       assert(false,"Should not reach here.");
1668       break;
1669   }
1670 }
1671 
1672 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1673   switch(typ) {
1674     case T_INT:
1675       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1676       break;
1677     case T_FLOAT:
1678       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1679       break;
1680     case T_LONG:
1681       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1682       break;
1683     case T_DOUBLE:
1684       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1685       break;
1686     default:
1687       assert(false,"Should not reach here.");
1688       break;
1689   }
1690 }
1691 
1692 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
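       // src is expected to hold 0 or 1 in each byte; subtracting it from zero turns
       // each element into 0x00/0xFF, which is then sign-extended to the element width.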
1693   if (vlen_in_bytes <= 16) {
1694     pxor (dst, dst);
1695     psubb(dst, src);
1696     switch (elem_bt) {
1697       case T_BYTE:   /* nothing to do */ break;
1698       case T_SHORT:  pmovsxbw(dst, dst); break;
1699       case T_INT:    pmovsxbd(dst, dst); break;
1700       case T_FLOAT:  pmovsxbd(dst, dst); break;
1701       case T_LONG:   pmovsxbq(dst, dst); break;
1702       case T_DOUBLE: pmovsxbq(dst, dst); break;
1703 
1704       default: assert(false, "%s", type2name(elem_bt));
1705     }
1706   } else {
1707     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1708     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1709 
1710     vpxor (dst, dst, dst, vlen_enc);
1711     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1712 
1713     switch (elem_bt) {
1714       case T_BYTE:   /* nothing to do */            break;
1715       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1716       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1717       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1718       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1719       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1720 
1721       default: assert(false, "%s", type2name(elem_bt));
1722     }
1723   }
1724 }
1725 
1726 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1727   if (novlbwdq) {
1728     vpmovsxbd(xtmp, src, vlen_enc);
1729     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1730             Assembler::eq, true, vlen_enc, noreg);
1731   } else {
1732     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1733     vpsubb(xtmp, xtmp, src, vlen_enc);
1734     evpmovb2m(dst, xtmp, vlen_enc);
1735   }
1736 }
1737 
1738 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1739   if (is_integral_type(bt)) {
1740     switch (vlen_in_bytes) {
1741       case 4:  movdl(dst, src);   break;
1742       case 8:  movq(dst, src);    break;
1743       case 16: movdqu(dst, src);  break;
1744       case 32: vmovdqu(dst, src); break;
1745       case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1746       default: ShouldNotReachHere();
1747     }
1748   } else {
1749     switch (vlen_in_bytes) {
1750       case 4:  movflt(dst, src); break;
1751       case 8:  movdbl(dst, src); break;
1752       case 16: movups(dst, src); break;
1753       case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1754       case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1755       default: ShouldNotReachHere();
1756     }
1757   }
1758 }
1759 
1760 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1761   assert(rscratch != noreg || always_reachable(src), "missing");
1762 
1763   if (reachable(src)) {
1764     load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1765   } else {
1766     lea(rscratch, src);
1767     load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1768   }
1769 }
1770 
1771 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1772   int vlen_enc = vector_length_encoding(vlen);
1773   if (VM_Version::supports_avx()) {
1774     if (bt == T_LONG) {
1775       if (VM_Version::supports_avx2()) {
1776         vpbroadcastq(dst, src, vlen_enc);
1777       } else {
1778         vmovddup(dst, src, vlen_enc);
1779       }
1780     } else if (bt == T_DOUBLE) {
1781       if (vlen_enc != Assembler::AVX_128bit) {
1782         vbroadcastsd(dst, src, vlen_enc, noreg);
1783       } else {
1784         vmovddup(dst, src, vlen_enc);
1785       }
1786     } else {
1787       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1788         vpbroadcastd(dst, src, vlen_enc);
1789       } else {
1790         vbroadcastss(dst, src, vlen_enc);
1791       }
1792     }
1793   } else if (VM_Version::supports_sse3()) {
1794     movddup(dst, src);
1795   } else {
1796     load_vector(bt, dst, src, vlen);
1797   }
1798 }
1799 
1800 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1801   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two consecutive types is 64 bytes.
1802   int offset = exact_log2(type2aelembytes(bt)) << 6;
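       // F/D would otherwise alias the I/L tables (same element size), hence the extra
       // 128 bytes below; e.g. T_FLOAT: (exact_log2(4) << 6) + 128 = 256.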
1803   if (is_floating_point_type(bt)) {
1804     offset += 128;
1805   }
1806   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1807   load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1808 }
1809 
1810 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1811 
1812 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1813   int vector_len = Assembler::AVX_128bit;
1814 
1815   switch (opcode) {
1816     case Op_AndReductionV:  pand(dst, src); break;
1817     case Op_OrReductionV:   por (dst, src); break;
1818     case Op_XorReductionV:  pxor(dst, src); break;
1819     case Op_MinReductionV:
1820       switch (typ) {
1821         case T_BYTE:        pminsb(dst, src); break;
1822         case T_SHORT:       pminsw(dst, src); break;
1823         case T_INT:         pminsd(dst, src); break;
1824         case T_LONG:        assert(UseAVX > 2, "required");
1825                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1826         default:            assert(false, "wrong type");
1827       }
1828       break;
1829     case Op_MaxReductionV:
1830       switch (typ) {
1831         case T_BYTE:        pmaxsb(dst, src); break;
1832         case T_SHORT:       pmaxsw(dst, src); break;
1833         case T_INT:         pmaxsd(dst, src); break;
1834         case T_LONG:        assert(UseAVX > 2, "required");
1835                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1836         default:            assert(false, "wrong type");
1837       }
1838       break;
1839     case Op_AddReductionVF: addss(dst, src); break;
1840     case Op_AddReductionVD: addsd(dst, src); break;
1841     case Op_AddReductionVI:
1842       switch (typ) {
1843         case T_BYTE:        paddb(dst, src); break;
1844         case T_SHORT:       paddw(dst, src); break;
1845         case T_INT:         paddd(dst, src); break;
1846         default:            assert(false, "wrong type");
1847       }
1848       break;
1849     case Op_AddReductionVL: paddq(dst, src); break;
1850     case Op_MulReductionVF: mulss(dst, src); break;
1851     case Op_MulReductionVD: mulsd(dst, src); break;
1852     case Op_MulReductionVI:
1853       switch (typ) {
1854         case T_SHORT:       pmullw(dst, src); break;
1855         case T_INT:         pmulld(dst, src); break;
1856         default:            assert(false, "wrong type");
1857       }
1858       break;
1859     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1860                             evpmullq(dst, dst, src, vector_len); break;
1861     default:                assert(false, "wrong opcode");
1862   }
1863 }
1864 
1865 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1866   switch (opcode) {
1867     case Op_AddReductionVF: addps(dst, src); break;
1868     case Op_AddReductionVD: addpd(dst, src); break;
1869     case Op_MulReductionVF: mulps(dst, src); break;
1870     case Op_MulReductionVD: mulpd(dst, src); break;
1871     default:                assert(false, "%s", NodeClassNames[opcode]);
1872   }
1873 }
1874 
1875 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1876   int vector_len = Assembler::AVX_256bit;
1877 
1878   switch (opcode) {
1879     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1880     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1881     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1882     case Op_MinReductionV:
1883       switch (typ) {
1884         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1885         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1886         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1887         case T_LONG:        assert(UseAVX > 2, "required");
1888                             vpminsq(dst, src1, src2, vector_len); break;
1889         default:            assert(false, "wrong type");
1890       }
1891       break;
1892     case Op_MaxReductionV:
1893       switch (typ) {
1894         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1895         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1896         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1897         case T_LONG:        assert(UseAVX > 2, "required");
1898                             vpmaxsq(dst, src1, src2, vector_len); break;
1899         default:            assert(false, "wrong type");
1900       }
1901       break;
1902     case Op_AddReductionVI:
1903       switch (typ) {
1904         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1905         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1906         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1907         default:            assert(false, "wrong type");
1908       }
1909       break;
1910     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1911     case Op_MulReductionVI:
1912       switch (typ) {
1913         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1914         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1915         default:            assert(false, "wrong type");
1916       }
1917       break;
1918     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1919     default:                assert(false, "wrong opcode");
1920   }
1921 }
1922 
1923 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1924   int vector_len = Assembler::AVX_256bit;
1925 
1926   switch (opcode) {
1927     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1928     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1929     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1930     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1931     default:                assert(false, "%s", NodeClassNames[opcode]);
1932   }
1933 }
1934 
1935 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1936                                   XMMRegister dst, XMMRegister src,
1937                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1938   switch (opcode) {
1939     case Op_AddReductionVF:
1940     case Op_MulReductionVF:
1941       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1942       break;
1943 
1944     case Op_AddReductionVD:
1945     case Op_MulReductionVD:
1946       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1947       break;
1948 
1949     default: assert(false, "wrong opcode");
1950   }
1951 }
1952 
1953 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1954                                             XMMRegister dst, XMMRegister src,
1955                                             XMMRegister vtmp1, XMMRegister vtmp2) {
1956   switch (opcode) {
1957     case Op_AddReductionVF:
1958     case Op_MulReductionVF:
1959       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1960       break;
1961 
1962     case Op_AddReductionVD:
1963     case Op_MulReductionVD:
1964       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1965       break;
1966 
1967     default: assert(false, "%s", NodeClassNames[opcode]);
1968   }
1969 }
1970 
1971 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1972                              Register dst, Register src1, XMMRegister src2,
1973                              XMMRegister vtmp1, XMMRegister vtmp2) {
1974   switch (vlen) {
1975     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1976     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1977     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1978     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1979 
1980     default: assert(false, "wrong vector length");
1981   }
1982 }
1983 
1984 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1985                              Register dst, Register src1, XMMRegister src2,
1986                              XMMRegister vtmp1, XMMRegister vtmp2) {
1987   switch (vlen) {
1988     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1989     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1990     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1991     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1992 
1993     default: assert(false, "wrong vector length");
1994   }
1995 }
1996 
1997 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1998                              Register dst, Register src1, XMMRegister src2,
1999                              XMMRegister vtmp1, XMMRegister vtmp2) {
2000   switch (vlen) {
2001     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2002     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2003     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2004     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2005 
2006     default: assert(false, "wrong vector length");
2007   }
2008 }
2009 
2010 void C2_MacroAssembler::reduceI(int opcode, int vlen,
2011                              Register dst, Register src1, XMMRegister src2,
2012                              XMMRegister vtmp1, XMMRegister vtmp2) {
2013   switch (vlen) {
2014     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2015     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2016     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2017     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2018 
2019     default: assert(false, "wrong vector length");
2020   }
2021 }
2022 
2023 void C2_MacroAssembler::reduceL(int opcode, int vlen,
2024                              Register dst, Register src1, XMMRegister src2,
2025                              XMMRegister vtmp1, XMMRegister vtmp2) {
2026   switch (vlen) {
2027     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2028     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2029     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2030 
2031     default: assert(false, "wrong vector length");
2032   }
2033 }
2034 
2035 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2036   switch (vlen) {
2037     case 2:
2038       assert(vtmp2 == xnoreg, "");
2039       reduce2F(opcode, dst, src, vtmp1);
2040       break;
2041     case 4:
2042       assert(vtmp2 == xnoreg, "");
2043       reduce4F(opcode, dst, src, vtmp1);
2044       break;
2045     case 8:
2046       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2047       break;
2048     case 16:
2049       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2050       break;
2051     default: assert(false, "wrong vector length");
2052   }
2053 }
2054 
2055 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2056   switch (vlen) {
2057     case 2:
2058       assert(vtmp2 == xnoreg, "");
2059       reduce2D(opcode, dst, src, vtmp1);
2060       break;
2061     case 4:
2062       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2063       break;
2064     case 8:
2065       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2066       break;
2067     default: assert(false, "wrong vector length");
2068   }
2069 }
2070 
2071 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2072   switch (vlen) {
2073     case 2:
2074       assert(vtmp1 == xnoreg, "");
2075       assert(vtmp2 == xnoreg, "");
2076       unorderedReduce2F(opcode, dst, src);
2077       break;
2078     case 4:
2079       assert(vtmp2 == xnoreg, "");
2080       unorderedReduce4F(opcode, dst, src, vtmp1);
2081       break;
2082     case 8:
2083       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2084       break;
2085     case 16:
2086       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2087       break;
2088     default: assert(false, "wrong vector length");
2089   }
2090 }
2091 
2092 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2093   switch (vlen) {
2094     case 2:
2095       assert(vtmp1 == xnoreg, "");
2096       assert(vtmp2 == xnoreg, "");
2097       unorderedReduce2D(opcode, dst, src);
2098       break;
2099     case 4:
2100       assert(vtmp2 == xnoreg, "");
2101       unorderedReduce4D(opcode, dst, src, vtmp1);
2102       break;
2103     case 8:
2104       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2105       break;
2106     default: assert(false, "wrong vector length");
2107   }
2108 }
2109 
2110 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
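       // Fold the two int lanes into one (horizontal add for Op_AddReductionVI,
       // shuffle + op otherwise), then combine with the scalar accumulator src1.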
2111   if (opcode == Op_AddReductionVI) {
2112     if (vtmp1 != src2) {
2113       movdqu(vtmp1, src2);
2114     }
2115     phaddd(vtmp1, vtmp1);
2116   } else {
2117     pshufd(vtmp1, src2, 0x1);
2118     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2119   }
2120   movdl(vtmp2, src1);
2121   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2122   movdl(dst, vtmp1);
2123 }
2124 
2125 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2126   if (opcode == Op_AddReductionVI) {
2127     if (vtmp1 != src2) {
2128       movdqu(vtmp1, src2);
2129     }
2130     phaddd(vtmp1, src2);
2131     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2132   } else {
2133     pshufd(vtmp2, src2, 0xE);
2134     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2135     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2136   }
2137 }
2138 
2139 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2140   if (opcode == Op_AddReductionVI) {
2141     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2142     vextracti128_high(vtmp2, vtmp1);
2143     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2144     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2145   } else {
2146     vextracti128_high(vtmp1, src2);
2147     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2148     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2149   }
2150 }
2151 
2152 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2153   vextracti64x4_high(vtmp2, src2);
2154   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2155   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2156 }
2157 
2158 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
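       // Repeatedly fold the upper half of the 8 active bytes onto the lower half
       // (4+4, 2+2, 1+1), then combine with the scalar accumulator src1.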
2159   pshufd(vtmp2, src2, 0x1);
2160   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2161   movdqu(vtmp1, vtmp2);
2162   psrldq(vtmp1, 2);
2163   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2164   movdqu(vtmp2, vtmp1);
2165   psrldq(vtmp2, 1);
2166   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2167   movdl(vtmp2, src1);
2168   pmovsxbd(vtmp1, vtmp1);
2169   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2170   pextrb(dst, vtmp1, 0x0);
2171   movsbl(dst, dst);
2172 }
2173 
2174 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2175   pshufd(vtmp1, src2, 0xE);
2176   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2177   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2178 }
2179 
2180 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2181   vextracti128_high(vtmp2, src2);
2182   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2183   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2184 }
2185 
2186 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2187   vextracti64x4_high(vtmp1, src2);
2188   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2189   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2190 }
2191 
2192 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2193   pmovsxbw(vtmp2, src2);
2194   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2195 }
2196 
2197 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2198   if (UseAVX > 1) {
2199     int vector_len = Assembler::AVX_256bit;
2200     vpmovsxbw(vtmp1, src2, vector_len);
2201     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2202   } else {
2203     pmovsxbw(vtmp2, src2);
2204     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2205     pshufd(vtmp2, src2, 0xE);   // bring the upper 8 bytes down to the low qword
2206     pmovsxbw(vtmp2, vtmp2);
2207     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2208   }
2209 }
2210 
2211 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2212   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2213     int vector_len = Assembler::AVX_512bit;
2214     vpmovsxbw(vtmp1, src2, vector_len);
2215     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2216   } else {
2217     assert(UseAVX >= 2,"Should not reach here.");
2218     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2219     vextracti128_high(vtmp2, src2);
2220     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2221   }
2222 }
2223 
2224 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2225   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2226   vextracti64x4_high(vtmp2, src2);
2227   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2228 }
2229 
2230 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2231   if (opcode == Op_AddReductionVI) {
2232     if (vtmp1 != src2) {
2233       movdqu(vtmp1, src2);
2234     }
2235     phaddw(vtmp1, vtmp1);
2236     phaddw(vtmp1, vtmp1);
2237   } else {
2238     pshufd(vtmp2, src2, 0x1);
2239     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2240     movdqu(vtmp1, vtmp2);
2241     psrldq(vtmp1, 2);
2242     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2243   }
2244   movdl(vtmp2, src1);
2245   pmovsxwd(vtmp1, vtmp1);
2246   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2247   pextrw(dst, vtmp1, 0x0);
2248   movswl(dst, dst);
2249 }
2250 
2251 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2252   if (opcode == Op_AddReductionVI) {
2253     if (vtmp1 != src2) {
2254       movdqu(vtmp1, src2);
2255     }
2256     phaddw(vtmp1, src2);
2257   } else {
2258     pshufd(vtmp1, src2, 0xE);
2259     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2260   }
2261   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2262 }
2263 
2264 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2265   if (opcode == Op_AddReductionVI) {
2266     int vector_len = Assembler::AVX_256bit;
2267     vphaddw(vtmp2, src2, src2, vector_len);
2268     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2269   } else {
2270     vextracti128_high(vtmp2, src2);
2271     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2272   }
2273   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2274 }
2275 
2276 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2277   int vector_len = Assembler::AVX_256bit;
2278   vextracti64x4_high(vtmp1, src2);
2279   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2280   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2281 }
2282 
2283 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2284   pshufd(vtmp2, src2, 0xE);
2285   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2286   movdq(vtmp1, src1);
2287   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2288   movdq(dst, vtmp1);
2289 }
2290 
2291 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2292   vextracti128_high(vtmp1, src2);
2293   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2294   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2295 }
2296 
2297 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2298   vextracti64x4_high(vtmp2, src2);
2299   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2300   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2301 }
2302 
2303 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
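       // Build a mask with the low 'len' bits set: BZHI clears all bits of -1 at and
       // above bit 'len', and the result is moved into the opmask register.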
2304   mov64(temp, -1L);
2305   bzhiq(temp, temp, len);
2306   kmovql(dst, temp);
2307 }
2308 
2309 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2310   reduce_operation_128(T_FLOAT, opcode, dst, src);
2311   pshufd(vtmp, src, 0x1);
2312   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2313 }
2314 
2315 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2316   reduce2F(opcode, dst, src, vtmp);
2317   pshufd(vtmp, src, 0x2);
2318   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2319   pshufd(vtmp, src, 0x3);
2320   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2321 }
2322 
2323 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2324   reduce4F(opcode, dst, src, vtmp2);
2325   vextractf128_high(vtmp2, src);
2326   reduce4F(opcode, dst, vtmp2, vtmp1);
2327 }
2328 
2329 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2330   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2331   vextracti64x4_high(vtmp1, src);
2332   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2333 }
2334 
2335 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2336   pshufd(dst, src, 0x1);
2337   reduce_operation_128(T_FLOAT, opcode, dst, src);
2338 }
2339 
2340 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2341   pshufd(vtmp, src, 0xE);
2342   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2343   unorderedReduce2F(opcode, dst, vtmp);
2344 }
2345 
2346 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2347   vextractf128_high(vtmp1, src);
2348   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2349   unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2350 }
2351 
2352 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2353   vextractf64x4_high(vtmp2, src);
2354   unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2355   unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2356 }
2357 
2358 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2359   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2360   pshufd(vtmp, src, 0xE);
2361   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2362 }
2363 
2364 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2365   reduce2D(opcode, dst, src, vtmp2);
2366   vextractf128_high(vtmp2, src);
2367   reduce2D(opcode, dst, vtmp2, vtmp1);
2368 }
2369 
2370 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2371   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2372   vextracti64x4_high(vtmp1, src);
2373   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2374 }
2375 
2376 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2377   pshufd(dst, src, 0xE);
2378   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2379 }
2380 
2381 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2382   vextractf128_high(vtmp, src);
2383   unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2384   unorderedReduce2D(opcode, dst, vtmp);
2385 }
2386 
2387 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2388   vextractf64x4_high(vtmp2, src);
2389   unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2390   unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2391 }
2392 
2393 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2394   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2395 }
2396 
2397 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2398   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2399 }
2400 
2401 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
2402   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2403 }
2404 
2405 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2406                                  int vec_enc) {
2407   switch(elem_bt) {
2408     case T_INT:
2409     case T_FLOAT:
2410       vmaskmovps(dst, src, mask, vec_enc);
2411       break;
2412     case T_LONG:
2413     case T_DOUBLE:
2414       vmaskmovpd(dst, src, mask, vec_enc);
2415       break;
2416     default:
2417       fatal("Unsupported type %s", type2name(elem_bt));
2418       break;
2419   }
2420 }
2421 
2422 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2423                                  int vec_enc) {
2424   switch(elem_bt) {
2425     case T_INT:
2426     case T_FLOAT:
2427       vmaskmovps(dst, src, mask, vec_enc);
2428       break;
2429     case T_LONG:
2430     case T_DOUBLE:
2431       vmaskmovpd(dst, src, mask, vec_enc);
2432       break;
2433     default:
2434       fatal("Unsupported type %s", type2name(elem_bt));
2435       break;
2436   }
2437 }
2438 
2439 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2440                                           XMMRegister dst, XMMRegister src,
2441                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2442                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2443   const int permconst[] = {1, 14};
2444   XMMRegister wsrc = src;
2445   XMMRegister wdst = xmm_0;
2446   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2447 
2448   int vlen_enc = Assembler::AVX_128bit;
2449   if (vlen == 16) {
2450     vlen_enc = Assembler::AVX_256bit;
2451   }
2452 
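       // Tree reduction: each step folds the upper half of the working vector onto the
       // lower half (cross-lane extracts for the 512/256-bit steps, in-lane permutes for
       // the 128-bit steps); the result is combined with dst when is_dst_valid.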
2453   for (int i = log2(vlen) - 1; i >=0; i--) {
2454     if (i == 0 && !is_dst_valid) {
2455       wdst = dst;
2456     }
2457     if (i == 3) {
2458       vextracti64x4_high(wtmp, wsrc);
2459     } else if (i == 2) {
2460       vextracti128_high(wtmp, wsrc);
2461     } else { // i = [0,1]
2462       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2463     }
2464     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2465     wsrc = wdst;
2466     vlen_enc = Assembler::AVX_128bit;
2467   }
2468   if (is_dst_valid) {
2469     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2470   }
2471 }
2472 
2473 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2474                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2475                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2476   XMMRegister wsrc = src;
2477   XMMRegister wdst = xmm_0;
2478   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2479   int vlen_enc = Assembler::AVX_128bit;
2480   if (vlen == 8) {
2481     vlen_enc = Assembler::AVX_256bit;
2482   }
2483   for (int i = log2(vlen) - 1; i >=0; i--) {
2484     if (i == 0 && !is_dst_valid) {
2485       wdst = dst;
2486     }
2487     if (i == 1) {
2488       vextracti128_high(wtmp, wsrc);
2489     } else if (i == 2) {
2490       vextracti64x4_high(wtmp, wsrc);
2491     } else {
2492       assert(i == 0, "%d", i);
2493       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2494     }
2495     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2496     wsrc = wdst;
2497     vlen_enc = Assembler::AVX_128bit;
2498   }
2499   if (is_dst_valid) {
2500     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2501   }
2502 }
2503 
2504 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2505   switch (bt) {
2506     case T_BYTE:  pextrb(dst, src, idx); break;
2507     case T_SHORT: pextrw(dst, src, idx); break;
2508     case T_INT:   pextrd(dst, src, idx); break;
2509     case T_LONG:  pextrq(dst, src, idx); break;
2510 
2511     default:
2512       assert(false,"Should not reach here.");
2513       break;
2514   }
2515 }
2516 
2517 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2518   int esize =  type2aelembytes(typ);
2519   int elem_per_lane = 16/esize;
2520   int lane = elemindex / elem_per_lane;
2521   int eindex = elemindex % elem_per_lane;
2522 
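       // Elements in the first 128-bit lane can be used from src directly; higher lanes
       // are first extracted into dst.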
2523   if (lane >= 2) {
2524     assert(UseAVX > 2, "required");
2525     vextractf32x4(dst, src, lane & 3);
2526     return dst;
2527   } else if (lane > 0) {
2528     assert(UseAVX > 0, "required");
2529     vextractf128(dst, src, lane);
2530     return dst;
2531   } else {
2532     return src;
2533   }
2534 }
2535 
2536 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2537   if (typ == T_BYTE) {
2538     movsbl(dst, dst);
2539   } else if (typ == T_SHORT) {
2540     movswl(dst, dst);
2541   }
2542 }
2543 
2544 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2545   int esize =  type2aelembytes(typ);
2546   int elem_per_lane = 16/esize;
2547   int eindex = elemindex % elem_per_lane;
2548   assert(is_integral_type(typ),"required");
2549 
2550   if (eindex == 0) {
2551     if (typ == T_LONG) {
2552       movq(dst, src);
2553     } else {
2554       movdl(dst, src);
2555       movsxl(typ, dst);
2556     }
2557   } else {
2558     extract(typ, dst, src, eindex);
2559     movsxl(typ, dst);
2560   }
2561 }
2562 
2563 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2564   int esize =  type2aelembytes(typ);
2565   int elem_per_lane = 16/esize;
2566   int eindex = elemindex % elem_per_lane;
2567   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2568 
2569   if (eindex == 0) {
2570     movq(dst, src);
2571   } else {
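         // Bring the requested element down to position 0: shufps for floats, a byte
         // shift (then movq to clear the upper half) for doubles.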
2572     if (typ == T_FLOAT) {
2573       if (UseAVX == 0) {
2574         movdqu(dst, src);
2575         shufps(dst, dst, eindex);
2576       } else {
2577         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2578       }
2579     } else {
2580       if (UseAVX == 0) {
2581         movdqu(dst, src);
2582         psrldq(dst, eindex*esize);
2583       } else {
2584         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2585       }
2586       movq(dst, dst);
2587     }
2588   }
2589   // Zero upper bits
2590   if (typ == T_FLOAT) {
2591     if (UseAVX == 0) {
2592       assert(vtmp != xnoreg, "required.");
2593       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2594       pand(dst, vtmp);
2595     } else {
2596       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2597     }
2598   }
2599 }
2600 
2601 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2602   switch(typ) {
2603     case T_BYTE:
2604     case T_BOOLEAN:
2605       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2606       break;
2607     case T_SHORT:
2608     case T_CHAR:
2609       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2610       break;
2611     case T_INT:
2612     case T_FLOAT:
2613       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2614       break;
2615     case T_LONG:
2616     case T_DOUBLE:
2617       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2618       break;
2619     default:
2620       assert(false,"Should not reach here.");
2621       break;
2622   }
2623 }
2624 
2625 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2626   assert(rscratch != noreg || always_reachable(src2), "missing");
2627 
2628   switch(typ) {
2629     case T_BOOLEAN:
2630     case T_BYTE:
2631       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2632       break;
2633     case T_CHAR:
2634     case T_SHORT:
2635       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2636       break;
2637     case T_INT:
2638     case T_FLOAT:
2639       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2640       break;
2641     case T_LONG:
2642     case T_DOUBLE:
2643       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2644       break;
2645     default:
2646       assert(false,"Should not reach here.");
2647       break;
2648   }
2649 }
2650 
2651 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2652   switch(typ) {
2653     case T_BYTE:
2654       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2655       break;
2656     case T_SHORT:
2657       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2658       break;
2659     case T_INT:
2660     case T_FLOAT:
2661       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2662       break;
2663     case T_LONG:
2664     case T_DOUBLE:
2665       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2666       break;
2667     default:
2668       assert(false,"Should not reach here.");
2669       break;
2670   }
2671 }
2672 
2673 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2674   assert(vlen_in_bytes <= 32, "");
2675   int esize = type2aelembytes(bt);
2676   if (vlen_in_bytes == 32) {
2677     assert(vtmp == xnoreg, "required.");
2678     if (esize >= 4) {
2679       vtestps(src1, src2, AVX_256bit);
2680     } else {
2681       vptest(src1, src2, AVX_256bit);
2682     }
2683     return;
2684   }
2685   if (vlen_in_bytes < 16) {
2686     // Duplicate the lower part to fill the whole register;
2687     // there is no need to do so for src2.
2688     assert(vtmp != xnoreg, "required");
2689     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2690     pshufd(vtmp, src1, shuffle_imm);
2691   } else {
2692     assert(vtmp == xnoreg, "required");
2693     vtmp = src1;
2694   }
2695   if (esize >= 4 && VM_Version::supports_avx()) {
2696     vtestps(vtmp, src2, AVX_128bit);
2697   } else {
2698     ptest(vtmp, src2);
2699   }
2700 }
2701 
2702 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2703 #ifdef ASSERT
2704   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2705   bool is_bw_supported = VM_Version::supports_avx512bw();
2706   if (is_bw && !is_bw_supported) {
2707     assert(vlen_enc != Assembler::AVX_512bit, "required");
2708     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2709            "XMM register should be 0-15");
2710   }
2711 #endif // ASSERT
2712   switch (elem_bt) {
2713     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2714     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2715     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2716     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2717     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2718     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2719     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2720   }
2721 }
2722 
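// Broadcast a scalar value from a general-purpose register into every lane of dst.
// With AVX-512 (plus BW for byte/short elements and VL for sub-512-bit vectors) the
// GPR-source EVEX broadcasts are used directly; otherwise the value is first moved
// into dst (movdl/movdq) and then broadcast with the AVX2 forms, which limits dst to
// xmm0-xmm15.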
2723 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2724   assert(UseAVX >= 2, "required");
2725   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2726   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2727   if ((UseAVX > 2) &&
2728       (!is_bw || VM_Version::supports_avx512bw()) &&
2729       (!is_vl || VM_Version::supports_avx512vl())) {
2730     switch (elem_bt) {
2731       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2732       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2733       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2734       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2735       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2736     }
2737   } else {
2738     assert(vlen_enc != Assembler::AVX_512bit, "required");
2739     assert((dst->encoding() < 16),"XMM register should be 0-15");
2740     switch (elem_bt) {
2741       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2742       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2743       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2744       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2745       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2746       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2747       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2748     }
2749   }
2750 }
2751 
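// Sign-extending conversion of byte lanes to the requested element type. T_FLOAT and
// T_DOUBLE widen to int lanes first and then convert to floating point; for T_DOUBLE
// the intermediate widening runs at half the target vector width because vcvtdq2pd
// doubles the element size.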
2752 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2753   switch (to_elem_bt) {
2754     case T_SHORT:
2755       vpmovsxbw(dst, src, vlen_enc);
2756       break;
2757     case T_INT:
2758       vpmovsxbd(dst, src, vlen_enc);
2759       break;
2760     case T_FLOAT:
2761       vpmovsxbd(dst, src, vlen_enc);
2762       vcvtdq2ps(dst, dst, vlen_enc);
2763       break;
2764     case T_LONG:
2765       vpmovsxbq(dst, src, vlen_enc);
2766       break;
2767     case T_DOUBLE: {
2768       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2769       vpmovsxbd(dst, src, mid_vlen_enc);
2770       vcvtdq2pd(dst, dst, vlen_enc);
2771       break;
2772     }
2773     default:
2774       fatal("Unsupported type %s", type2name(to_elem_bt));
2775       break;
2776   }
2777 }
2778 
2779 //-------------------------------------------------------------------------------------------
2780 
// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through the stack.
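//
// At the Java level this corresponds roughly to the following sketch (illustrative
// only; scanForCandidate is a hypothetical name for one pcmpestri probe that finds
// the next position where the first substring element matches):
//
//   int i = 0;
//   while (true) {
//     int j = scanForCandidate(str, i, substr);   // pcmpestri, 8 or 16 elements at a time
//     if (j < 0) return -1;                       // scanned the whole string
//     if (str.regionMatches(j, substr, 0, substr.length)) return j;
//     i = j + 1;                                  // resume after the failed candidate
//   }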
2783 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2784                                          Register cnt1, Register cnt2,
2785                                          int int_cnt2,  Register result,
2786                                          XMMRegister vec, Register tmp,
2787                                          int ae) {
2788   ShortBranchVerifier sbv(this);
2789   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2790   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2791 
2792   // This method uses the pcmpestri instruction with bound registers
2793   //   inputs:
2794   //     xmm - substring
2795   //     rax - substring length (elements count)
2796   //     mem - scanned string
2797   //     rdx - string length (elements count)
2798   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2799   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2800   //   outputs:
2801   //     rcx - matched index in string
2802   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2803   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2804   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2805   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2806   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2807 
2808   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2809         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2810         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2811 
2812   // Note, inline_string_indexOf() generates checks:
2813   // if (substr.count > string.count) return -1;
2814   // if (substr.count == 0) return 0;
2815   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2816 
2817   // Load substring.
2818   if (ae == StrIntrinsicNode::UL) {
2819     pmovzxbw(vec, Address(str2, 0));
2820   } else {
2821     movdqu(vec, Address(str2, 0));
2822   }
2823   movl(cnt2, int_cnt2);
2824   movptr(result, str1); // string addr
2825 
2826   if (int_cnt2 > stride) {
2827     jmpb(SCAN_TO_SUBSTR);
2828 
    // Reload substr for rescan; this code
    // is executed only for large substrings (> 8 chars).
2831     bind(RELOAD_SUBSTR);
2832     if (ae == StrIntrinsicNode::UL) {
2833       pmovzxbw(vec, Address(str2, 0));
2834     } else {
2835       movdqu(vec, Address(str2, 0));
2836     }
2837     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2838 
2839     bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
    // again. Start from the next element after the previous match.
2843 
    // cnt2 is the number of remaining substring elements and
    // cnt1 is the number of remaining string elements when the compare failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2847     subl(cnt1, cnt2);
2848     addl(cnt1, int_cnt2);
2849     movl(cnt2, int_cnt2); // Now restore cnt2
2850 
2851     decrementl(cnt1);     // Shift to next element
2852     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
2854 
2855     addptr(result, (1<<scale1));
2856 
2857   } // (int_cnt2 > 8)
2858 
2859   // Scan string for start of substr in 16-byte vectors
2860   bind(SCAN_TO_SUBSTR);
2861   pcmpestri(vec, Address(result, 0), mode);
2862   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2863   subl(cnt1, stride);
2864   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2865   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
2867   addptr(result, 16);
2868   jmpb(SCAN_TO_SUBSTR);
2869 
2870   // Found a potential substr
2871   bind(FOUND_CANDIDATE);
2872   // Matched whole vector if first element matched (tmp(rcx) == 0).
2873   if (int_cnt2 == stride) {
2874     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2875   } else { // int_cnt2 > 8
2876     jccb(Assembler::overflow, FOUND_SUBSTR);
2877   }
2878   // After pcmpestri tmp(rcx) contains matched element index
2879   // Compute start addr of substr
2880   lea(result, Address(result, tmp, scale1));
2881 
2882   // Make sure string is still long enough
2883   subl(cnt1, tmp);
2884   cmpl(cnt1, cnt2);
2885   if (int_cnt2 == stride) {
2886     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2887   } else { // int_cnt2 > 8
2888     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2889   }
  // Fewer elements left than the substring.
2891 
2892   bind(RET_NOT_FOUND);
2893   movl(result, -1);
2894   jmp(EXIT);
2895 
2896   if (int_cnt2 > stride) {
2897     // This code is optimized for the case when whole substring
2898     // is matched if its head is matched.
2899     bind(MATCH_SUBSTR_HEAD);
2900     pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
2902     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2903 
2904     Label CONT_SCAN_SUBSTR;
2905     // Compare the rest of substring (> 8 chars).
2906     bind(FOUND_SUBSTR);
2907     // First 8 chars are already matched.
2908     negptr(cnt2);
2909     addptr(cnt2, stride);
2910 
2911     bind(SCAN_SUBSTR);
2912     subl(cnt1, stride);
2913     cmpl(cnt2, -stride); // Do not read beyond substring
2914     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2915     // Back-up strings to avoid reading beyond substring:
2916     // cnt1 = cnt1 - cnt2 + 8
2917     addl(cnt1, cnt2); // cnt2 is negative
2918     addl(cnt1, stride);
2919     movl(cnt2, stride); negptr(cnt2);
2920     bind(CONT_SCAN_SUBSTR);
2921     if (int_cnt2 < (int)G) {
2922       int tail_off1 = int_cnt2<<scale1;
2923       int tail_off2 = int_cnt2<<scale2;
2924       if (ae == StrIntrinsicNode::UL) {
2925         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2926       } else {
2927         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2928       }
2929       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2930     } else {
2931       // calculate index in register to avoid integer overflow (int_cnt2*2)
2932       movl(tmp, int_cnt2);
2933       addptr(tmp, cnt2);
2934       if (ae == StrIntrinsicNode::UL) {
2935         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2936       } else {
2937         movdqu(vec, Address(str2, tmp, scale2, 0));
2938       }
2939       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2940     }
2941     // Need to reload strings pointers if not matched whole vector
2942     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2943     addptr(cnt2, stride);
2944     jcc(Assembler::negative, SCAN_SUBSTR);
2945     // Fall through if found full substring
2946 
2947   } // (int_cnt2 > 8)
2948 
2949   bind(RET_FOUND);
  // The result is found if we matched the full small substring.
2951   // Compute substr offset
2952   subptr(result, str1);
2953   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2954     shrl(result, 1); // index
2955   }
2956   bind(EXIT);
2957 
2958 } // string_indexofC8
2959 
// Small strings are loaded through the stack if they cross a page boundary.
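// Rationale: the 16-byte vector loads may read past the end of a short string, which
// is only safe if the access cannot run into an unmapped page. Short strings that
// might cross a page boundary when read with a full 16-byte load are therefore copied
// to the stack first (see the page-boundary checks below).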
2961 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2962                                        Register cnt1, Register cnt2,
2963                                        int int_cnt2,  Register result,
2964                                        XMMRegister vec, Register tmp,
2965                                        int ae) {
2966   ShortBranchVerifier sbv(this);
2967   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2968   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2969 
2970   //
2971   // int_cnt2 is length of small (< 8 chars) constant substring
2972   // or (-1) for non constant substring in which case its length
2973   // is in cnt2 register.
2974   //
2975   // Note, inline_string_indexOf() generates checks:
2976   // if (substr.count > string.count) return -1;
2977   // if (substr.count == 0) return 0;
2978   //
2979   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2980   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2981   // This method uses the pcmpestri instruction with bound registers
2982   //   inputs:
2983   //     xmm - substring
2984   //     rax - substring length (elements count)
2985   //     mem - scanned string
2986   //     rdx - string length (elements count)
2987   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2988   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2989   //   outputs:
2990   //     rcx - matched index in string
2991   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2992   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2993   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2994   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2995 
2996   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2997         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2998         FOUND_CANDIDATE;
2999 
3000   { //========================================================
3001     // We don't know where these strings are located
    // and we can't read beyond them. Load them through the stack.
3003     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
3004 
3005     movptr(tmp, rsp); // save old SP
3006 
3007     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
3008       if (int_cnt2 == (1>>scale2)) { // One byte
3009         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
3010         load_unsigned_byte(result, Address(str2, 0));
3011         movdl(vec, result); // move 32 bits
3012       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
3013         // Not enough header space in 32-bit VM: 12+3 = 15.
3014         movl(result, Address(str2, -1));
3015         shrl(result, 8);
3016         movdl(vec, result); // move 32 bits
3017       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
3018         load_unsigned_short(result, Address(str2, 0));
3019         movdl(vec, result); // move 32 bits
3020       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
3021         movdl(vec, Address(str2, 0)); // move 32 bits
3022       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
3023         movq(vec, Address(str2, 0));  // move 64 bits
3024       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
3025         // Array header size is 12 bytes in 32-bit VM
3026         // + 6 bytes for 3 chars == 18 bytes,
3027         // enough space to load vec and shift.
3028         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3029         if (ae == StrIntrinsicNode::UL) {
3030           int tail_off = int_cnt2-8;
3031           pmovzxbw(vec, Address(str2, tail_off));
3032           psrldq(vec, -2*tail_off);
3033         }
3034         else {
3035           int tail_off = int_cnt2*(1<<scale2);
3036           movdqu(vec, Address(str2, tail_off-16));
3037           psrldq(vec, 16-tail_off);
3038         }
3039       }
3040     } else { // not constant substring
3041       cmpl(cnt2, stride);
3042       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3043 
      // We can read beyond the string if str+16 does not cross a page boundary
3045       // since heaps are aligned and mapped by pages.
3046       assert(os::vm_page_size() < (int)G, "default page should be small");
3047       movl(result, str2); // We need only low 32 bits
3048       andl(result, ((int)os::vm_page_size()-1));
3049       cmpl(result, ((int)os::vm_page_size()-16));
3050       jccb(Assembler::belowEqual, CHECK_STR);
3051 
      // Move small strings to the stack to allow loading 16 bytes into vec.
3053       subptr(rsp, 16);
3054       int stk_offset = wordSize-(1<<scale2);
3055       push(cnt2);
3056 
3057       bind(COPY_SUBSTR);
3058       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3059         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3060         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3061       } else if (ae == StrIntrinsicNode::UU) {
3062         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3063         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3064       }
3065       decrement(cnt2);
3066       jccb(Assembler::notZero, COPY_SUBSTR);
3067 
3068       pop(cnt2);
3069       movptr(str2, rsp);  // New substring address
3070     } // non constant
3071 
3072     bind(CHECK_STR);
3073     cmpl(cnt1, stride);
3074     jccb(Assembler::aboveEqual, BIG_STRINGS);
3075 
3076     // Check cross page boundary.
3077     movl(result, str1); // We need only low 32 bits
3078     andl(result, ((int)os::vm_page_size()-1));
3079     cmpl(result, ((int)os::vm_page_size()-16));
3080     jccb(Assembler::belowEqual, BIG_STRINGS);
3081 
3082     subptr(rsp, 16);
3083     int stk_offset = -(1<<scale1);
3084     if (int_cnt2 < 0) { // not constant
3085       push(cnt2);
3086       stk_offset += wordSize;
3087     }
3088     movl(cnt2, cnt1);
3089 
3090     bind(COPY_STR);
3091     if (ae == StrIntrinsicNode::LL) {
3092       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3093       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3094     } else {
3095       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3096       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3097     }
3098     decrement(cnt2);
3099     jccb(Assembler::notZero, COPY_STR);
3100 
3101     if (int_cnt2 < 0) { // not constant
3102       pop(cnt2);
3103     }
3104     movptr(str1, rsp);  // New string address
3105 
3106     bind(BIG_STRINGS);
3107     // Load substring.
3108     if (int_cnt2 < 0) { // -1
3109       if (ae == StrIntrinsicNode::UL) {
3110         pmovzxbw(vec, Address(str2, 0));
3111       } else {
3112         movdqu(vec, Address(str2, 0));
3113       }
3114       push(cnt2);       // substr count
3115       push(str2);       // substr addr
3116       push(str1);       // string addr
3117     } else {
3118       // Small (< 8 chars) constant substrings are loaded already.
3119       movl(cnt2, int_cnt2);
3120     }
3121     push(tmp);  // original SP
3122 
3123   } // Finished loading
3124 
3125   //========================================================
3126   // Start search
3127   //
3128 
3129   movptr(result, str1); // string addr
3130 
3131   if (int_cnt2  < 0) {  // Only for non constant substring
3132     jmpb(SCAN_TO_SUBSTR);
3133 
3134     // SP saved at sp+0
3135     // String saved at sp+1*wordSize
3136     // Substr saved at sp+2*wordSize
3137     // Substr count saved at sp+3*wordSize
3138 
    // Reload substr for rescan; this code
    // is executed only for large substrings (> 8 chars).
3141     bind(RELOAD_SUBSTR);
3142     movptr(str2, Address(rsp, 2*wordSize));
3143     movl(cnt2, Address(rsp, 3*wordSize));
3144     if (ae == StrIntrinsicNode::UL) {
3145       pmovzxbw(vec, Address(str2, 0));
3146     } else {
3147       movdqu(vec, Address(str2, 0));
3148     }
    // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
    // again. Start from the next element after the previous match.
3152     subptr(str1, result); // Restore counter
3153     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3154       shrl(str1, 1);
3155     }
3156     addl(cnt1, str1);
3157     decrementl(cnt1);   // Shift to next element
3158     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
3160 
3161     addptr(result, (1<<scale1));
3162   } // non constant
3163 
3164   // Scan string for start of substr in 16-byte vectors
3165   bind(SCAN_TO_SUBSTR);
3166   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3167   pcmpestri(vec, Address(result, 0), mode);
3168   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3169   subl(cnt1, stride);
3170   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3171   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
3173   addptr(result, 16);
3174 
3175   bind(ADJUST_STR);
3176   cmpl(cnt1, stride); // Do not read beyond string
3177   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3178   // Back-up string to avoid reading beyond string.
3179   lea(result, Address(result, cnt1, scale1, -16));
3180   movl(cnt1, stride);
3181   jmpb(SCAN_TO_SUBSTR);
3182 
3183   // Found a potential substr
3184   bind(FOUND_CANDIDATE);
3185   // After pcmpestri tmp(rcx) contains matched element index
3186 
3187   // Make sure string is still long enough
3188   subl(cnt1, tmp);
3189   cmpl(cnt1, cnt2);
3190   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Fewer elements left than the substring.
3192 
3193   bind(RET_NOT_FOUND);
3194   movl(result, -1);
3195   jmp(CLEANUP);
3196 
3197   bind(FOUND_SUBSTR);
3198   // Compute start addr of substr
3199   lea(result, Address(result, tmp, scale1));
3200   if (int_cnt2 > 0) { // Constant substring
3201     // Repeat search for small substring (< 8 chars)
3202     // from new point without reloading substring.
3203     // Have to check that we don't read beyond string.
3204     cmpl(tmp, stride-int_cnt2);
3205     jccb(Assembler::greater, ADJUST_STR);
3206     // Fall through if matched whole substring.
3207   } else { // non constant
3208     assert(int_cnt2 == -1, "should be != 0");
3209 
3210     addl(tmp, cnt2);
3211     // Found result if we matched whole substring.
3212     cmpl(tmp, stride);
3213     jcc(Assembler::lessEqual, RET_FOUND);
3214 
3215     // Repeat search for small substring (<= 8 chars)
3216     // from new point 'str1' without reloading substring.
3217     cmpl(cnt2, stride);
3218     // Have to check that we don't read beyond string.
3219     jccb(Assembler::lessEqual, ADJUST_STR);
3220 
3221     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3222     // Compare the rest of substring (> 8 chars).
3223     movptr(str1, result);
3224 
3225     cmpl(tmp, cnt2);
3226     // First 8 chars are already matched.
3227     jccb(Assembler::equal, CHECK_NEXT);
3228 
3229     bind(SCAN_SUBSTR);
3230     pcmpestri(vec, Address(str1, 0), mode);
3231     // Need to reload strings pointers if not matched whole vector
3232     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3233 
3234     bind(CHECK_NEXT);
3235     subl(cnt2, stride);
3236     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3237     addptr(str1, 16);
3238     if (ae == StrIntrinsicNode::UL) {
3239       addptr(str2, 8);
3240     } else {
3241       addptr(str2, 16);
3242     }
3243     subl(cnt1, stride);
3244     cmpl(cnt2, stride); // Do not read beyond substring
3245     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3246     // Back-up strings to avoid reading beyond substring.
3247 
3248     if (ae == StrIntrinsicNode::UL) {
3249       lea(str2, Address(str2, cnt2, scale2, -8));
3250       lea(str1, Address(str1, cnt2, scale1, -16));
3251     } else {
3252       lea(str2, Address(str2, cnt2, scale2, -16));
3253       lea(str1, Address(str1, cnt2, scale1, -16));
3254     }
3255     subl(cnt1, cnt2);
3256     movl(cnt2, stride);
3257     addl(cnt1, stride);
3258     bind(CONT_SCAN_SUBSTR);
3259     if (ae == StrIntrinsicNode::UL) {
3260       pmovzxbw(vec, Address(str2, 0));
3261     } else {
3262       movdqu(vec, Address(str2, 0));
3263     }
3264     jmp(SCAN_SUBSTR);
3265 
3266     bind(RET_FOUND_LONG);
3267     movptr(str1, Address(rsp, wordSize));
3268   } // non constant
3269 
3270   bind(RET_FOUND);
3271   // Compute substr offset
3272   subptr(result, str1);
3273   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3274     shrl(result, 1); // index
3275   }
3276   bind(CLEANUP);
3277   pop(rsp); // restore SP
3278 
3279 } // string_indexof
3280 
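// Find the first occurrence of the char 'ch' in the UTF-16 string str1 of cnt1 chars.
// result is the char index of the match, or -1 if not found. Uses a 16-char AVX2 loop
// when available, then an 8-char SSE loop, then a scalar tail.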
3281 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3282                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3283   ShortBranchVerifier sbv(this);
3284   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3285 
3286   int stride = 8;
3287 
3288   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3289         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3290         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3291         FOUND_SEQ_CHAR, DONE_LABEL;
3292 
3293   movptr(result, str1);
3294   if (UseAVX >= 2) {
3295     cmpl(cnt1, stride);
3296     jcc(Assembler::less, SCAN_TO_CHAR);
3297     cmpl(cnt1, 2*stride);
3298     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3299     movdl(vec1, ch);
3300     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3301     vpxor(vec2, vec2);
3302     movl(tmp, cnt1);
3303     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3304     andl(cnt1,0x0000000F);  //tail count (in chars)
3305 
3306     bind(SCAN_TO_16_CHAR_LOOP);
3307     vmovdqu(vec3, Address(result, 0));
3308     vpcmpeqw(vec3, vec3, vec1, 1);
3309     vptest(vec2, vec3);
3310     jcc(Assembler::carryClear, FOUND_CHAR);
3311     addptr(result, 32);
3312     subl(tmp, 2*stride);
3313     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3314     jmp(SCAN_TO_8_CHAR);
3315     bind(SCAN_TO_8_CHAR_INIT);
3316     movdl(vec1, ch);
3317     pshuflw(vec1, vec1, 0x00);
3318     pshufd(vec1, vec1, 0);
3319     pxor(vec2, vec2);
3320   }
3321   bind(SCAN_TO_8_CHAR);
3322   cmpl(cnt1, stride);
3323   jcc(Assembler::less, SCAN_TO_CHAR);
3324   if (UseAVX < 2) {
3325     movdl(vec1, ch);
3326     pshuflw(vec1, vec1, 0x00);
3327     pshufd(vec1, vec1, 0);
3328     pxor(vec2, vec2);
3329   }
3330   movl(tmp, cnt1);
3331   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3332   andl(cnt1,0x00000007);  //tail count (in chars)
3333 
3334   bind(SCAN_TO_8_CHAR_LOOP);
3335   movdqu(vec3, Address(result, 0));
3336   pcmpeqw(vec3, vec1);
3337   ptest(vec2, vec3);
3338   jcc(Assembler::carryClear, FOUND_CHAR);
3339   addptr(result, 16);
3340   subl(tmp, stride);
3341   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3342   bind(SCAN_TO_CHAR);
3343   testl(cnt1, cnt1);
3344   jcc(Assembler::zero, RET_NOT_FOUND);
3345   bind(SCAN_TO_CHAR_LOOP);
3346   load_unsigned_short(tmp, Address(result, 0));
3347   cmpl(ch, tmp);
3348   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3349   addptr(result, 2);
3350   subl(cnt1, 1);
3351   jccb(Assembler::zero, RET_NOT_FOUND);
3352   jmp(SCAN_TO_CHAR_LOOP);
3353 
3354   bind(RET_NOT_FOUND);
3355   movl(result, -1);
3356   jmpb(DONE_LABEL);
3357 
3358   bind(FOUND_CHAR);
3359   if (UseAVX >= 2) {
3360     vpmovmskb(tmp, vec3);
3361   } else {
3362     pmovmskb(tmp, vec3);
3363   }
3364   bsfl(ch, tmp);
3365   addptr(result, ch);
3366 
3367   bind(FOUND_SEQ_CHAR);
3368   subptr(result, str1);
3369   shrl(result, 1);
3370 
3371   bind(DONE_LABEL);
3372 } // string_indexof_char
3373 
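// Latin-1 (byte) variant of string_indexof_char: scans 32 bytes per AVX2 iteration,
// then 16 bytes per SSE iteration, then a scalar byte tail. result is the byte index
// of the first match, or -1 if not found.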
3374 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3375                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3376   ShortBranchVerifier sbv(this);
3377   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3378 
3379   int stride = 16;
3380 
3381   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3382         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3383         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3384         FOUND_SEQ_CHAR, DONE_LABEL;
3385 
3386   movptr(result, str1);
3387   if (UseAVX >= 2) {
3388     cmpl(cnt1, stride);
3389     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3390     cmpl(cnt1, stride*2);
3391     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3392     movdl(vec1, ch);
3393     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3394     vpxor(vec2, vec2);
3395     movl(tmp, cnt1);
3396     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3397     andl(cnt1,0x0000001F);  //tail count (in chars)
3398 
3399     bind(SCAN_TO_32_CHAR_LOOP);
3400     vmovdqu(vec3, Address(result, 0));
3401     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3402     vptest(vec2, vec3);
3403     jcc(Assembler::carryClear, FOUND_CHAR);
3404     addptr(result, 32);
3405     subl(tmp, stride*2);
3406     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3407     jmp(SCAN_TO_16_CHAR);
3408 
3409     bind(SCAN_TO_16_CHAR_INIT);
3410     movdl(vec1, ch);
3411     pxor(vec2, vec2);
3412     pshufb(vec1, vec2);
3413   }
3414 
3415   bind(SCAN_TO_16_CHAR);
3416   cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3418   if (UseAVX < 2) {
3419     movdl(vec1, ch);
3420     pxor(vec2, vec2);
3421     pshufb(vec1, vec2);
3422   }
3423   movl(tmp, cnt1);
3424   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3425   andl(cnt1,0x0000000F);  //tail count (in bytes)
3426 
3427   bind(SCAN_TO_16_CHAR_LOOP);
3428   movdqu(vec3, Address(result, 0));
3429   pcmpeqb(vec3, vec1);
3430   ptest(vec2, vec3);
3431   jcc(Assembler::carryClear, FOUND_CHAR);
3432   addptr(result, 16);
3433   subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3435 
3436   bind(SCAN_TO_CHAR_INIT);
3437   testl(cnt1, cnt1);
3438   jcc(Assembler::zero, RET_NOT_FOUND);
3439   bind(SCAN_TO_CHAR_LOOP);
3440   load_unsigned_byte(tmp, Address(result, 0));
3441   cmpl(ch, tmp);
3442   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3443   addptr(result, 1);
3444   subl(cnt1, 1);
3445   jccb(Assembler::zero, RET_NOT_FOUND);
3446   jmp(SCAN_TO_CHAR_LOOP);
3447 
3448   bind(RET_NOT_FOUND);
3449   movl(result, -1);
3450   jmpb(DONE_LABEL);
3451 
3452   bind(FOUND_CHAR);
3453   if (UseAVX >= 2) {
3454     vpmovmskb(tmp, vec3);
3455   } else {
3456     pmovmskb(tmp, vec3);
3457   }
3458   bsfl(ch, tmp);
3459   addptr(result, ch);
3460 
3461   bind(FOUND_SEQ_CHAR);
3462   subptr(result, str1);
3463 
3464   bind(DONE_LABEL);
3465 } // stringL_indexof_char
3466 
3467 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3468   switch (eltype) {
3469   case T_BOOLEAN: return sizeof(jboolean);
3470   case T_BYTE:  return sizeof(jbyte);
3471   case T_SHORT: return sizeof(jshort);
3472   case T_CHAR:  return sizeof(jchar);
3473   case T_INT:   return sizeof(jint);
3474   default:
3475     ShouldNotReachHere();
3476     return -1;
3477   }
3478 }
3479 
3480 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3481   switch (eltype) {
3482   // T_BOOLEAN used as surrogate for unsigned byte
3483   case T_BOOLEAN: movzbl(dst, src);   break;
3484   case T_BYTE:    movsbl(dst, src);   break;
3485   case T_SHORT:   movswl(dst, src);   break;
3486   case T_CHAR:    movzwl(dst, src);   break;
3487   case T_INT:     movl(dst, src);     break;
3488   default:
3489     ShouldNotReachHere();
3490   }
3491 }
3492 
3493 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3494   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3495 }
3496 
3497 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3498   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3499 }
3500 
3501 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3502   const int vlen = Assembler::AVX_256bit;
3503   switch (eltype) {
3504   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3505   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3506   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3507   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3508   case T_INT:
3509     // do nothing
3510     break;
3511   default:
3512     ShouldNotReachHere();
3513   }
3514 }
3515 
3516 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3517                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3518                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3519                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3520                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3521                                         BasicType eltype) {
3522   ShortBranchVerifier sbv(this);
3523   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3524   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3525   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3526 
3527   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3528         SHORT_UNROLLED_LOOP_EXIT,
3529         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3530         UNROLLED_VECTOR_LOOP_BEGIN,
3531         END;
3532   switch (eltype) {
3533   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3534   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3535   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3536   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3537   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3538   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3539   }
3540 
3541   // For "renaming" for readibility of the code
3542   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3543                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3544                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3545 
3546   const int elsize = arrays_hashcode_elsize(eltype);
3547 
3548   /*
3549     if (cnt1 >= 2) {
3550       if (cnt1 >= 32) {
3551         UNROLLED VECTOR LOOP
3552       }
3553       UNROLLED SCALAR LOOP
3554     }
3555     SINGLE SCALAR
3556    */
3557 
3558   cmpl(cnt1, 32);
3559   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3560 
3561   // cnt1 >= 32 && generate_vectorized_loop
3562   xorl(index, index);
3563 
3564   // vresult = IntVector.zero(I256);
3565   for (int idx = 0; idx < 4; idx++) {
3566     vpxor(vresult[idx], vresult[idx]);
3567   }
3568   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3569   Register bound = tmp2;
3570   Register next = tmp3;
3571   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3572   movl(next, Address(tmp2, 0));
3573   movdl(vnext, next);
3574   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3575 
3576   // index = 0;
3577   // bound = cnt1 & ~(32 - 1);
3578   movl(bound, cnt1);
3579   andl(bound, ~(32 - 1));
3580   // for (; index < bound; index += 32) {
3581   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3582   // result *= next;
3583   imull(result, next);
  // Loop fission to front-load the cost of fetching from memory; OOO execution
  // can then hopefully do a better job of prefetching.
3586   for (int idx = 0; idx < 4; idx++) {
3587     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3588   }
3589   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3590   for (int idx = 0; idx < 4; idx++) {
3591     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3592     arrays_hashcode_elvcast(vtmp[idx], eltype);
3593     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3594   }
3595   // index += 32;
3596   addl(index, 32);
3597   // index < bound;
3598   cmpl(index, bound);
3599   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3600   // }
3601 
3602   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3603   subl(cnt1, bound);
3604   // release bound
3605 
3606   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3607   for (int idx = 0; idx < 4; idx++) {
3608     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3609     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3610     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3611   }
3612   // result += vresult.reduceLanes(ADD);
3613   for (int idx = 0; idx < 4; idx++) {
3614     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3615   }
3616 
3617   // } else if (cnt1 < 32) {
3618 
3619   bind(SHORT_UNROLLED_BEGIN);
3620   // int i = 1;
3621   movl(index, 1);
3622   cmpl(index, cnt1);
3623   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3624 
3625   // for (; i < cnt1 ; i += 2) {
3626   bind(SHORT_UNROLLED_LOOP_BEGIN);
  movl(tmp3, 961);      // 961 == 31 * 31: two elements are folded in per iteration
  imull(result, tmp3);
3629   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  movl(tmp3, tmp2);
  shll(tmp3, 5);
  subl(tmp3, tmp2);     // tmp3 = tmp2 * 31, computed as (tmp2 << 5) - tmp2
3633   addl(result, tmp3);
3634   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3635   addl(result, tmp3);
3636   addl(index, 2);
3637   cmpl(index, cnt1);
3638   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3639 
3640   // }
3641   // if (i >= cnt1) {
3642   bind(SHORT_UNROLLED_LOOP_EXIT);
3643   jccb(Assembler::greater, END);
  movl(tmp2, result);
  shll(result, 5);
  subl(result, tmp2);   // result *= 31, computed as (result << 5) - result
3647   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3648   addl(result, tmp3);
3649   // }
3650   bind(END);
3651 
3652   BLOCK_COMMENT("} // arrays_hashcode");
3653 
3654 } // arrays_hashcode
3655 
3656 // helper function for string_compare
3657 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3658                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3659                                            Address::ScaleFactor scale2, Register index, int ae) {
3660   if (ae == StrIntrinsicNode::LL) {
3661     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3662     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3663   } else if (ae == StrIntrinsicNode::UU) {
3664     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3665     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3666   } else {
3667     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3668     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3669   }
3670 }
3671 
3672 // Compare strings, used for char[] and byte[].
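// Semantically (illustrative sketch): return the difference of the first mismatching
// element pair within the first min(cnt1, cnt2) elements, otherwise cnt1 - cnt2.
// In the mixed-encoding cases str1 is read as bytes and str2 as chars, and the final
// result is negated for UL.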
3673 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3674                                        Register cnt1, Register cnt2, Register result,
3675                                        XMMRegister vec1, int ae, KRegister mask) {
3676   ShortBranchVerifier sbv(this);
3677   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3678   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only AVX3
3679   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3680   int stride2x2 = 0x40;
3681   Address::ScaleFactor scale = Address::no_scale;
3682   Address::ScaleFactor scale1 = Address::no_scale;
3683   Address::ScaleFactor scale2 = Address::no_scale;
3684 
3685   if (ae != StrIntrinsicNode::LL) {
3686     stride2x2 = 0x20;
3687   }
3688 
3689   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3690     shrl(cnt2, 1);
3691   }
  // Compute the minimum of the string lengths, and save the
  // difference of the string lengths on the stack.
  // cnt2 = min(cnt1, cnt2), done with a conditional move.
3695   movl(result, cnt1);
3696   subl(cnt1, cnt2);
3697   push(cnt1);
3698   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3699 
3700   // Is the minimum length zero?
3701   testl(cnt2, cnt2);
3702   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3703   if (ae == StrIntrinsicNode::LL) {
3704     // Load first bytes
3705     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3706     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3707   } else if (ae == StrIntrinsicNode::UU) {
3708     // Load first characters
3709     load_unsigned_short(result, Address(str1, 0));
3710     load_unsigned_short(cnt1, Address(str2, 0));
3711   } else {
3712     load_unsigned_byte(result, Address(str1, 0));
3713     load_unsigned_short(cnt1, Address(str2, 0));
3714   }
3715   subl(result, cnt1);
3716   jcc(Assembler::notZero,  POP_LABEL);
3717 
3718   if (ae == StrIntrinsicNode::UU) {
3719     // Divide length by 2 to get number of chars
3720     shrl(cnt2, 1);
3721   }
3722   cmpl(cnt2, 1);
3723   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3724 
3725   // Check if the strings start at the same location and setup scale and stride
3726   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3727     cmpptr(str1, str2);
3728     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3729     if (ae == StrIntrinsicNode::LL) {
3730       scale = Address::times_1;
3731       stride = 16;
3732     } else {
3733       scale = Address::times_2;
3734       stride = 8;
3735     }
3736   } else {
3737     scale1 = Address::times_1;
3738     scale2 = Address::times_2;
3739     // scale not used
3740     stride = 8;
3741   }
3742 
3743   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3744     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3745     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3746     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3747     Label COMPARE_TAIL_LONG;
3748     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only AVX3
3749 
3750     int pcmpmask = 0x19;
3751     if (ae == StrIntrinsicNode::LL) {
3752       pcmpmask &= ~0x01;
3753     }
3754 
    // Setup to compare 16-char (32-byte) vectors,
    // starting from the first character again because it has an aligned address.
3757     if (ae == StrIntrinsicNode::LL) {
3758       stride2 = 32;
3759     } else {
3760       stride2 = 16;
3761     }
3762     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3763       adr_stride = stride << scale;
3764     } else {
3765       adr_stride1 = 8;  //stride << scale1;
3766       adr_stride2 = 16; //stride << scale2;
3767     }
3768 
3769     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as element counters
3771     movl(result, cnt2);
3772     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3773     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3774 
3775     // fast path : compare first 2 8-char vectors.
3776     bind(COMPARE_16_CHARS);
3777     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3778       movdqu(vec1, Address(str1, 0));
3779     } else {
3780       pmovzxbw(vec1, Address(str1, 0));
3781     }
3782     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3783     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3784 
3785     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3786       movdqu(vec1, Address(str1, adr_stride));
3787       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3788     } else {
3789       pmovzxbw(vec1, Address(str1, adr_stride1));
3790       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3791     }
3792     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3793     addl(cnt1, stride);
3794 
3795     // Compare the characters at index in cnt1
3796     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3797     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3798     subl(result, cnt2);
3799     jmp(POP_LABEL);
3800 
3801     // Setup the registers to start vector comparison loop
3802     bind(COMPARE_WIDE_VECTORS);
3803     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3804       lea(str1, Address(str1, result, scale));
3805       lea(str2, Address(str2, result, scale));
3806     } else {
3807       lea(str1, Address(str1, result, scale1));
3808       lea(str2, Address(str2, result, scale2));
3809     }
3810     subl(result, stride2);
3811     subl(cnt2, stride2);
3812     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3813     negptr(result);
3814 
3815     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3816     bind(COMPARE_WIDE_VECTORS_LOOP);
3817 
3818     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3819       cmpl(cnt2, stride2x2);
3820       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3821       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3822       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3823 
3824       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3825       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3826         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // mask == 11..11 if the operands are equal, otherwise it has some 0 bits
3828       } else {
3829         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // mask == 11..11 if the operands are equal, otherwise it has some 0 bits
3831       }
3832       kortestql(mask, mask);
3833       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3834       addptr(result, stride2x2);  // update since we already compared at this addr
3835       subl(cnt2, stride2x2);      // and sub the size too
3836       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3837 
3838       vpxor(vec1, vec1);
3839       jmpb(COMPARE_WIDE_TAIL);
3840     }//if (VM_Version::supports_avx512vlbw())
3841 
3842     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3843     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3844       vmovdqu(vec1, Address(str1, result, scale));
3845       vpxor(vec1, Address(str2, result, scale));
3846     } else {
3847       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3848       vpxor(vec1, Address(str2, result, scale2));
3849     }
3850     vptest(vec1, vec1);
3851     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3852     addptr(result, stride2);
3853     subl(cnt2, stride2);
3854     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3855     // clean upper bits of YMM registers
3856     vpxor(vec1, vec1);
3857 
3858     // compare wide vectors tail
3859     bind(COMPARE_WIDE_TAIL);
3860     testptr(result, result);
3861     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3862 
3863     movl(result, stride2);
3864     movl(cnt2, result);
3865     negptr(result);
3866     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3867 
    // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3869     bind(VECTOR_NOT_EQUAL);
3870     // clean upper bits of YMM registers
3871     vpxor(vec1, vec1);
3872     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3873       lea(str1, Address(str1, result, scale));
3874       lea(str2, Address(str2, result, scale));
3875     } else {
3876       lea(str1, Address(str1, result, scale1));
3877       lea(str2, Address(str2, result, scale2));
3878     }
3879     jmp(COMPARE_16_CHARS);
3880 
    // Compare tail chars, length between 1 and 15 chars
3882     bind(COMPARE_TAIL_LONG);
3883     movl(cnt2, result);
3884     cmpl(cnt2, stride);
3885     jcc(Assembler::less, COMPARE_SMALL_STR);
3886 
3887     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3888       movdqu(vec1, Address(str1, 0));
3889     } else {
3890       pmovzxbw(vec1, Address(str1, 0));
3891     }
3892     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3893     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3894     subptr(cnt2, stride);
3895     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3896     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3897       lea(str1, Address(str1, result, scale));
3898       lea(str2, Address(str2, result, scale));
3899     } else {
3900       lea(str1, Address(str1, result, scale1));
3901       lea(str2, Address(str2, result, scale2));
3902     }
3903     negptr(cnt2);
3904     jmpb(WHILE_HEAD_LABEL);
3905 
3906     bind(COMPARE_SMALL_STR);
3907   } else if (UseSSE42Intrinsics) {
3908     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3909     int pcmpmask = 0x19;
    // Setup to compare 8-char (16-byte) vectors,
    // starting from the first character again because it has an aligned address.
3912     movl(result, cnt2);
3913     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3914     if (ae == StrIntrinsicNode::LL) {
3915       pcmpmask &= ~0x01;
3916     }
3917     jcc(Assembler::zero, COMPARE_TAIL);
3918     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3919       lea(str1, Address(str1, result, scale));
3920       lea(str2, Address(str2, result, scale));
3921     } else {
3922       lea(str1, Address(str1, result, scale1));
3923       lea(str2, Address(str2, result, scale2));
3924     }
3925     negptr(result);
3926 
3927     // pcmpestri
3928     //   inputs:
3929     //     vec1- substring
3930     //     rax - negative string length (elements count)
3931     //     mem - scanned string
3932     //     rdx - string length (elements count)
3933     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3934     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3935     //   outputs:
3936     //     rcx - first mismatched element index
3937     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3938 
3939     bind(COMPARE_WIDE_VECTORS);
3940     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3941       movdqu(vec1, Address(str1, result, scale));
3942       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3943     } else {
3944       pmovzxbw(vec1, Address(str1, result, scale1));
3945       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3946     }
3947     // After pcmpestri cnt1(rcx) contains mismatched element index
3948 
3949     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3950     addptr(result, stride);
3951     subptr(cnt2, stride);
3952     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3953 
3954     // compare wide vectors tail
3955     testptr(result, result);
3956     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3957 
3958     movl(cnt2, stride);
3959     movl(result, stride);
3960     negptr(result);
3961     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3962       movdqu(vec1, Address(str1, result, scale));
3963       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3964     } else {
3965       pmovzxbw(vec1, Address(str1, result, scale1));
3966       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3967     }
3968     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3969 
3970     // Mismatched characters in the vectors
3971     bind(VECTOR_NOT_EQUAL);
3972     addptr(cnt1, result);
3973     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3974     subl(result, cnt2);
3975     jmpb(POP_LABEL);
3976 
3977     bind(COMPARE_TAIL); // limit is zero
3978     movl(cnt2, result);
3979     // Fallthru to tail compare
3980   }
3981   // Shift str2 and str1 to the end of the arrays, negate min
3982   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3983     lea(str1, Address(str1, cnt2, scale));
3984     lea(str2, Address(str2, cnt2, scale));
3985   } else {
3986     lea(str1, Address(str1, cnt2, scale1));
3987     lea(str2, Address(str2, cnt2, scale2));
3988   }
3989   decrementl(cnt2);  // first character was compared already
3990   negptr(cnt2);
3991 
3992   // Compare the rest of the elements
3993   bind(WHILE_HEAD_LABEL);
3994   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3995   subl(result, cnt1);
3996   jccb(Assembler::notZero, POP_LABEL);
3997   increment(cnt2);
3998   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3999 
4000   // Strings are equal up to min length.  Return the length difference.
4001   bind(LENGTH_DIFF_LABEL);
4002   pop(result);
4003   if (ae == StrIntrinsicNode::UU) {
4004     // Divide diff by 2 to get number of chars
4005     sarl(result, 1);
4006   }
4007   jmpb(DONE_LABEL);
4008 
4009   if (VM_Version::supports_avx512vlbw()) {
4010 
4011     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
4012 
4013     kmovql(cnt1, mask);
4014     notq(cnt1);
4015     bsfq(cnt2, cnt1);
4016     if (ae != StrIntrinsicNode::LL) {
4017       // Divide diff by 2 to get number of chars
4018       sarl(cnt2, 1);
4019     }
4020     addq(result, cnt2);
4021     if (ae == StrIntrinsicNode::LL) {
4022       load_unsigned_byte(cnt1, Address(str2, result));
4023       load_unsigned_byte(result, Address(str1, result));
4024     } else if (ae == StrIntrinsicNode::UU) {
4025       load_unsigned_short(cnt1, Address(str2, result, scale));
4026       load_unsigned_short(result, Address(str1, result, scale));
4027     } else {
4028       load_unsigned_short(cnt1, Address(str2, result, scale2));
4029       load_unsigned_byte(result, Address(str1, result, scale1));
4030     }
4031     subl(result, cnt1);
4032     jmpb(POP_LABEL);
4033   }//if (VM_Version::supports_avx512vlbw())
4034 
4035   // Discard the stored length difference
4036   bind(POP_LABEL);
4037   pop(cnt1);
4038 
4039   // That's it
4040   bind(DONE_LABEL);
4041   if(ae == StrIntrinsicNode::UL) {
4042     negl(result);
4043   }
4044 
4045 }
4046 
// Search for a non-ASCII character (negative byte value) in a byte array,
4048 // return the index of the first such character, otherwise the length
4049 // of the array segment searched.
4050 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4051 //   @IntrinsicCandidate
4052 //   public static int countPositives(byte[] ba, int off, int len) {
4053 //     for (int i = off; i < off + len; i++) {
4054 //       if (ba[i] < 0) {
4055 //         return i - off;
4056 //       }
4057 //     }
4058 //     return len;
4059 //   }
4060 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4061   Register result, Register tmp1,
4062   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4063   // rsi: byte array
4064   // rcx: len
4065   // rax: result
4066   ShortBranchVerifier sbv(this);
4067   assert_different_registers(ary1, len, result, tmp1);
4068   assert_different_registers(vec1, vec2);
4069   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4070 
4071   movl(result, len); // copy
4072   // len == 0
4073   testl(len, len);
4074   jcc(Assembler::zero, DONE);
4075 
4076   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4077     VM_Version::supports_avx512vlbw() &&
4078     VM_Version::supports_bmi2()) {
4079 
4080     Label test_64_loop, test_tail, BREAK_LOOP;
4081     movl(tmp1, len);
4082     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4083 
4084     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4085     andl(len,  0xffffffc0); // vector count (in chars)
4086     jccb(Assembler::zero, test_tail);
4087 
4088     lea(ary1, Address(ary1, len, Address::times_1));
4089     negptr(len);
4090 
4091     bind(test_64_loop);
4092     // Check whether our 64 elements of size byte contain negatives
4093     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4094     kortestql(mask1, mask1);
4095     jcc(Assembler::notZero, BREAK_LOOP);
4096 
4097     addptr(len, 64);
4098     jccb(Assembler::notZero, test_64_loop);
4099 
4100     bind(test_tail);
4101     // bail out when there is nothing to be done
4102     testl(tmp1, -1);
4103     jcc(Assembler::zero, DONE);
4104 
4105 
    // Check the tail for the absence of negatives:
    // ~(~0 << len) applied up to two times (for the 32-bit scenario)
4108     {
4109       Register tmp3_aliased = len;
4110       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4111       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4112       notq(tmp3_aliased);
4113       kmovql(mask2, tmp3_aliased);
4114     }
4115 
4116     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4117     ktestq(mask1, mask2);
4118     jcc(Assembler::zero, DONE);
4119 
    // do a full check for negative bytes in the tail
4121     movl(len, tmp1); // tmp1 holds low 6-bit from original len;
4122                      // ary1 already pointing to the right place
4123     jmpb(TAIL_START);
4124 
4125     bind(BREAK_LOOP);
4126     // At least one byte in the last 64 byte block was negative.
4127     // Set up to look at the last 64 bytes as if they were a tail
4128     lea(ary1, Address(ary1, len, Address::times_1));
4129     addptr(result, len);
4130     // Ignore the very last byte: if all others are positive,
4131     // it must be negative, so we can skip right to the 2+1 byte
4132     // end comparison at this point
4133     orl(result, 63);
4134     movl(len, 63);
4135     // Fallthru to tail compare
4136   } else {
4137 
4138     if (UseAVX >= 2) {
4139       // With AVX2, use 32-byte vector compare
4140       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4141 
4142       // Compare 32-byte vectors
4143       testl(len, 0xffffffe0);   // vector count (in bytes)
4144       jccb(Assembler::zero, TAIL_START);
4145 
4146       andl(len, 0xffffffe0);
4147       lea(ary1, Address(ary1, len, Address::times_1));
4148       negptr(len);
4149 
      movl(tmp1, 0x80808080);   // create mask to test for negative bytes (sign bit set) in vector
4151       movdl(vec2, tmp1);
4152       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4153 
4154       bind(COMPARE_WIDE_VECTORS);
4155       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4156       vptest(vec1, vec2);
4157       jccb(Assembler::notZero, BREAK_LOOP);
4158       addptr(len, 32);
4159       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4160 
4161       testl(result, 0x0000001f);   // any bytes remaining?
4162       jcc(Assembler::zero, DONE);
4163 
4164       // Quick test using the already prepared vector mask
4165       movl(len, result);
4166       andl(len, 0x0000001f);
4167       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4168       vptest(vec1, vec2);
4169       jcc(Assembler::zero, DONE);
      // There are negative bytes, jump to the tail to determine exactly where
4171       jmpb(TAIL_START);
4172 
4173       bind(BREAK_LOOP);
4174       // At least one byte in the last 32-byte vector is negative.
4175       // Set up to look at the last 32 bytes as if they were a tail
4176       lea(ary1, Address(ary1, len, Address::times_1));
4177       addptr(result, len);
4178       // Ignore the very last byte: if all others are positive,
4179       // it must be negative, so we can skip right to the 2+1 byte
4180       // end comparison at this point
4181       orl(result, 31);
4182       movl(len, 31);
4183       // Fallthru to tail compare
4184     } else if (UseSSE42Intrinsics) {
4185       // With SSE4.2, use double quad vector compare
4186       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4187 
4188       // Compare 16-byte vectors
4189       testl(len, 0xfffffff0);   // vector count (in bytes)
4190       jcc(Assembler::zero, TAIL_START);
4191 
4192       andl(len, 0xfffffff0);
4193       lea(ary1, Address(ary1, len, Address::times_1));
4194       negptr(len);
4195 
4196       movl(tmp1, 0x80808080);
4197       movdl(vec2, tmp1);
4198       pshufd(vec2, vec2, 0);
4199 
4200       bind(COMPARE_WIDE_VECTORS);
4201       movdqu(vec1, Address(ary1, len, Address::times_1));
4202       ptest(vec1, vec2);
4203       jccb(Assembler::notZero, BREAK_LOOP);
4204       addptr(len, 16);
4205       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4206 
4207       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4208       jcc(Assembler::zero, DONE);
4209 
4210       // Quick test using the already prepared vector mask
4211       movl(len, result);
4212       andl(len, 0x0000000f);   // tail count (in bytes)
4213       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4214       ptest(vec1, vec2);
4215       jcc(Assembler::zero, DONE);
4216       jmpb(TAIL_START);
4217 
4218       bind(BREAK_LOOP);
4219       // At least one byte in the last 16-byte vector is negative.
      // Set up to look at the last 16 bytes as if they were a tail
4221       lea(ary1, Address(ary1, len, Address::times_1));
4222       addptr(result, len);
4223       // Ignore the very last byte: if all others are positive,
4224       // it must be negative, so we can skip right to the 2+1 byte
4225       // end comparison at this point
4226       orl(result, 15);
4227       movl(len, 15);
4228       // Fallthru to tail compare
4229     }
4230   }
4231 
4232   bind(TAIL_START);
4233   // Compare 4-byte vectors
4234   andl(len, 0xfffffffc); // vector count (in bytes)
4235   jccb(Assembler::zero, COMPARE_CHAR);
4236 
4237   lea(ary1, Address(ary1, len, Address::times_1));
4238   negptr(len);
4239 
4240   bind(COMPARE_VECTORS);
4241   movl(tmp1, Address(ary1, len, Address::times_1));
4242   andl(tmp1, 0x80808080);
4243   jccb(Assembler::notZero, TAIL_ADJUST);
4244   addptr(len, 4);
4245   jccb(Assembler::notZero, COMPARE_VECTORS);
4246 
4247   // Compare trailing char (final 2-3 bytes), if any
4248   bind(COMPARE_CHAR);
4249 
4250   testl(result, 0x2);   // tail  char
4251   jccb(Assembler::zero, COMPARE_BYTE);
4252   load_unsigned_short(tmp1, Address(ary1, 0));
4253   andl(tmp1, 0x00008080);
4254   jccb(Assembler::notZero, CHAR_ADJUST);
4255   lea(ary1, Address(ary1, 2));
4256 
4257   bind(COMPARE_BYTE);
4258   testl(result, 0x1);   // tail  byte
4259   jccb(Assembler::zero, DONE);
4260   load_unsigned_byte(tmp1, Address(ary1, 0));
4261   testl(tmp1, 0x00000080);
4262   jccb(Assembler::zero, DONE);
4263   subptr(result, 1);
4264   jmpb(DONE);
4265 
4266   bind(TAIL_ADJUST);
  // There are negative bytes in the last 4-byte block.
4268   // Adjust result and check the next three bytes
4269   addptr(result, len);
4270   orl(result, 3);
4271   lea(ary1, Address(ary1, len, Address::times_1));
4272   jmpb(COMPARE_CHAR);
4273 
4274   bind(CHAR_ADJUST);
4275   // We are looking at a char + optional byte tail, and found that one
4276   // of the bytes in the char is negative. Adjust the result, check the
4277   // first byte and readjust if needed.
4278   andl(result, 0xfffffffc);
4279   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4280   jccb(Assembler::notZero, DONE);
4281   addptr(result, 1);
4282 
4283   // That's it
4284   bind(DONE);
4285   if (UseAVX >= 2) {
4286     // clean upper bits of YMM registers
4287     vpxor(vec1, vec1);
4288     vpxor(vec2, vec2);
4289   }
4290 }
4291 
4292 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4293 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4294                                       Register limit, Register result, Register chr,
4295                                       XMMRegister vec1, XMMRegister vec2, bool is_char,
4296                                       KRegister mask, bool expand_ary2) {
4297   // for expand_ary2, limit is the (smaller) size of the second array.
4298   ShortBranchVerifier sbv(this);
4299   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4300 
4301   assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4302          "Expansion only implemented for AVX2");
4303 
4304   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4305   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4306 
4307   Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4308   int scaleIncr = expand_ary2 ? 8 : 16;
4309 
4310   if (is_array_equ) {
4311     // Check the input args
4312     cmpoop(ary1, ary2);
4313     jcc(Assembler::equal, TRUE_LABEL);
4314 
4315     // Need additional checks for arrays_equals.
4316     testptr(ary1, ary1);
4317     jcc(Assembler::zero, FALSE_LABEL);
4318     testptr(ary2, ary2);
4319     jcc(Assembler::zero, FALSE_LABEL);
4320 
4321     // Check the lengths
4322     movl(limit, Address(ary1, length_offset));
4323     cmpl(limit, Address(ary2, length_offset));
4324     jcc(Assembler::notEqual, FALSE_LABEL);
4325   }
4326 
4327   // count == 0
4328   testl(limit, limit);
4329   jcc(Assembler::zero, TRUE_LABEL);
4330 
4331   if (is_array_equ) {
4332     // Load array address
4333     lea(ary1, Address(ary1, base_offset));
4334     lea(ary2, Address(ary2, base_offset));
4335   }
4336 
4337   if (is_array_equ && is_char) {
4338     // arrays_equals when used for char[].
4339     shll(limit, 1);      // byte count != 0
4340   }
4341   movl(result, limit); // copy
4342 
4343   if (UseAVX >= 2) {
4344     // With AVX2, use 32-byte vector compare
4345     Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4346 
4347     // Compare 32-byte vectors
4348     if (expand_ary2) {
4349       andl(result, 0x0000000f);  //   tail count (in bytes)
4350       andl(limit, 0xfffffff0);   // vector count (in bytes)
4351       jcc(Assembler::zero, COMPARE_TAIL);
4352     } else {
4353       andl(result, 0x0000001f);  //   tail count (in bytes)
4354       andl(limit, 0xffffffe0);   // vector count (in bytes)
4355       jcc(Assembler::zero, COMPARE_TAIL_16);
4356     }
4357 
4358     lea(ary1, Address(ary1, limit, scaleFactor));
4359     lea(ary2, Address(ary2, limit, Address::times_1));
4360     negptr(limit);
4361 
4362     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4363       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4364 
4365       cmpl(limit, -64);
4366       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4367 
4368       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4369 
4370       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4371       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4372       kortestql(mask, mask);
4373       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4374       addptr(limit, 64);  // update since we already compared at this addr
4375       cmpl(limit, -64);
4376       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4377 
4378       // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the non-wide path:
4380       //  cmpl(limit, 0);
4381       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
      // But since we stopped at ary{1,2}+limit, which is no farther than
      // 64 bytes from the ends of the arrays at ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
4386       //
      addptr(result, -64);   // safe: we have just compared bytes in this area
4388       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4389       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4390       kortestql(mask, mask);
4391       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4392 
4393       jmp(TRUE_LABEL);
4394 
4395       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4396 
4397     }//if (VM_Version::supports_avx512vlbw())
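    // The 64-byte window above may overlap bytes that the wide loop already
    // proved equal; re-comparing equal bytes is harmless, which is what makes
    // the shortcut legal. A rough scalar sketch of the same trick (hypothetical
    // helper, illustration only):
    //   static bool tail64_equal(const uint8_t* a, const uint8_t* b, size_t n) {
    //     size_t start = (n >= 64) ? n - 64 : 0;   // final, possibly overlapping, window
    //     return memcmp(a + start, b + start, n - start) == 0;
    //   }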
4398 
4399     bind(COMPARE_WIDE_VECTORS);
4400     vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4401     if (expand_ary2) {
4402       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4403     } else {
4404       vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4405     }
4406     vpxor(vec1, vec2);
4407 
4408     vptest(vec1, vec1);
4409     jcc(Assembler::notZero, FALSE_LABEL);
4410     addptr(limit, scaleIncr * 2);
4411     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4412 
4413     testl(result, result);
4414     jcc(Assembler::zero, TRUE_LABEL);
4415 
4416     vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4417     if (expand_ary2) {
4418       vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4419     } else {
4420       vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4421     }
4422     vpxor(vec1, vec2);
4423 
4424     vptest(vec1, vec1);
4425     jcc(Assembler::notZero, FALSE_LABEL);
4426     jmp(TRUE_LABEL);
4427 
4428     bind(COMPARE_TAIL_16); // limit is zero
4429     movl(limit, result);
4430 
4431     // Compare 16-byte chunks
4432     andl(result, 0x0000000f);  //   tail count (in bytes)
4433     andl(limit, 0xfffffff0);   // vector count (in bytes)
4434     jcc(Assembler::zero, COMPARE_TAIL);
4435 
4436     lea(ary1, Address(ary1, limit, scaleFactor));
4437     lea(ary2, Address(ary2, limit, Address::times_1));
4438     negptr(limit);
4439 
4440     bind(COMPARE_WIDE_VECTORS_16);
4441     movdqu(vec1, Address(ary1, limit, scaleFactor));
4442     if (expand_ary2) {
4443       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4444     } else {
4445       movdqu(vec2, Address(ary2, limit, Address::times_1));
4446     }
4447     pxor(vec1, vec2);
4448 
4449     ptest(vec1, vec1);
4450     jcc(Assembler::notZero, FALSE_LABEL);
4451     addptr(limit, scaleIncr);
4452     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4453 
4454     bind(COMPARE_TAIL); // limit is zero
4455     movl(limit, result);
4456     // Fallthru to tail compare
4457   } else if (UseSSE42Intrinsics) {
4458     // With SSE4.2, use double quad vector compare
4459     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4460 
4461     // Compare 16-byte vectors
4462     andl(result, 0x0000000f);  //   tail count (in bytes)
4463     andl(limit, 0xfffffff0);   // vector count (in bytes)
4464     jcc(Assembler::zero, COMPARE_TAIL);
4465 
4466     lea(ary1, Address(ary1, limit, Address::times_1));
4467     lea(ary2, Address(ary2, limit, Address::times_1));
4468     negptr(limit);
4469 
4470     bind(COMPARE_WIDE_VECTORS);
4471     movdqu(vec1, Address(ary1, limit, Address::times_1));
4472     movdqu(vec2, Address(ary2, limit, Address::times_1));
4473     pxor(vec1, vec2);
4474 
4475     ptest(vec1, vec1);
4476     jcc(Assembler::notZero, FALSE_LABEL);
4477     addptr(limit, 16);
4478     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4479 
4480     testl(result, result);
4481     jcc(Assembler::zero, TRUE_LABEL);
4482 
4483     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4484     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4485     pxor(vec1, vec2);
4486 
4487     ptest(vec1, vec1);
4488     jccb(Assembler::notZero, FALSE_LABEL);
4489     jmpb(TRUE_LABEL);
4490 
4491     bind(COMPARE_TAIL); // limit is zero
4492     movl(limit, result);
4493     // Fallthru to tail compare
4494   }
4495 
4496   // Compare 4-byte vectors
4497   if (expand_ary2) {
4498     testl(result, result);
4499     jccb(Assembler::zero, TRUE_LABEL);
4500   } else {
4501     andl(limit, 0xfffffffc); // vector count (in bytes)
4502     jccb(Assembler::zero, COMPARE_CHAR);
4503   }
4504 
4505   lea(ary1, Address(ary1, limit, scaleFactor));
4506   lea(ary2, Address(ary2, limit, Address::times_1));
4507   negptr(limit);
4508 
4509   bind(COMPARE_VECTORS);
4510   if (expand_ary2) {
4511     // There are no "vector" operations for bytes to shorts
4512     movzbl(chr, Address(ary2, limit, Address::times_1));
4513     cmpw(Address(ary1, limit, Address::times_2), chr);
4514     jccb(Assembler::notEqual, FALSE_LABEL);
4515     addptr(limit, 1);
4516     jcc(Assembler::notZero, COMPARE_VECTORS);
4517     jmp(TRUE_LABEL);
4518   } else {
4519     movl(chr, Address(ary1, limit, Address::times_1));
4520     cmpl(chr, Address(ary2, limit, Address::times_1));
4521     jccb(Assembler::notEqual, FALSE_LABEL);
4522     addptr(limit, 4);
4523     jcc(Assembler::notZero, COMPARE_VECTORS);
4524   }
4525 
4526   // Compare trailing char (final 2 bytes), if any
4527   bind(COMPARE_CHAR);
4528   testl(result, 0x2);   // tail  char
4529   jccb(Assembler::zero, COMPARE_BYTE);
4530   load_unsigned_short(chr, Address(ary1, 0));
4531   load_unsigned_short(limit, Address(ary2, 0));
4532   cmpl(chr, limit);
4533   jccb(Assembler::notEqual, FALSE_LABEL);
4534 
4535   if (is_array_equ && is_char) {
4536     bind(COMPARE_BYTE);
4537   } else {
4538     lea(ary1, Address(ary1, 2));
4539     lea(ary2, Address(ary2, 2));
4540 
4541     bind(COMPARE_BYTE);
4542     testl(result, 0x1);   // tail  byte
4543     jccb(Assembler::zero, TRUE_LABEL);
4544     load_unsigned_byte(chr, Address(ary1, 0));
4545     load_unsigned_byte(limit, Address(ary2, 0));
4546     cmpl(chr, limit);
4547     jccb(Assembler::notEqual, FALSE_LABEL);
4548   }
4549   bind(TRUE_LABEL);
4550   movl(result, 1);   // return true
4551   jmpb(DONE);
4552 
4553   bind(FALSE_LABEL);
4554   xorl(result, result); // return false
4555 
4556   // That's it
4557   bind(DONE);
4558   if (UseAVX >= 2) {
4559     // clean upper bits of YMM registers
4560     vpxor(vec1, vec1);
4561     vpxor(vec2, vec2);
4562   }
4563 }
4564 
4565 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4566 #define __ masm.
4567   Register dst = stub.data<0>();
4568   XMMRegister src = stub.data<1>();
4569   address target = stub.data<2>();
4570   __ bind(stub.entry());
4571   __ subptr(rsp, 8);
4572   __ movdbl(Address(rsp), src);
4573   __ call(RuntimeAddress(target));
4574   __ pop(dst);
4575   __ jmp(stub.continuation());
4576 #undef __
4577 }
4578 
4579 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4580   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4581   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4582 
4583   address slowpath_target;
4584   if (dst_bt == T_INT) {
4585     if (src_bt == T_FLOAT) {
4586       cvttss2sil(dst, src);
4587       cmpl(dst, 0x80000000);
4588       slowpath_target = StubRoutines::x86::f2i_fixup();
4589     } else {
4590       cvttsd2sil(dst, src);
4591       cmpl(dst, 0x80000000);
4592       slowpath_target = StubRoutines::x86::d2i_fixup();
4593     }
4594   } else {
4595     if (src_bt == T_FLOAT) {
4596       cvttss2siq(dst, src);
4597       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4598       slowpath_target = StubRoutines::x86::f2l_fixup();
4599     } else {
4600       cvttsd2siq(dst, src);
4601       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4602       slowpath_target = StubRoutines::x86::d2l_fixup();
4603     }
4604   }
4605 
4606   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4607   jcc(Assembler::equal, stub->entry());
4608   bind(stub->continuation());
4609 }
4610 
4611 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4612                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4613   switch(ideal_opc) {
4614     case Op_LShiftVS:
4615       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4616     case Op_LShiftVI:
4617       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4618     case Op_LShiftVL:
4619       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4620     case Op_RShiftVS:
4621       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4622     case Op_RShiftVI:
4623       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4624     case Op_RShiftVL:
4625       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4626     case Op_URShiftVS:
4627       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4628     case Op_URShiftVI:
4629       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4630     case Op_URShiftVL:
4631       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4632     case Op_RotateRightV:
4633       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4634     case Op_RotateLeftV:
4635       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4636     default:
4637       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4638       break;
4639   }
4640 }
4641 
4642 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4643                                                XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4644   if (is_unsigned) {
4645     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4646   } else {
4647     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4648   }
4649 }
4650 
4651 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4652                                                       XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4653   switch (elem_bt) {
4654     case T_BYTE:
4655       if (ideal_opc == Op_SaturatingAddV) {
4656         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4657       } else {
4658         assert(ideal_opc == Op_SaturatingSubV, "");
4659         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4660       }
4661       break;
4662     case T_SHORT:
4663       if (ideal_opc == Op_SaturatingAddV) {
4664         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4665       } else {
4666         assert(ideal_opc == Op_SaturatingSubV, "");
4667         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4668       }
4669       break;
4670     default:
4671       fatal("Unsupported type %s", type2name(elem_bt));
4672       break;
4673   }
4674 }
4675 
4676 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4677                                                         XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4678   switch (elem_bt) {
4679     case T_BYTE:
4680       if (ideal_opc == Op_SaturatingAddV) {
4681         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4682       } else {
4683         assert(ideal_opc == Op_SaturatingSubV, "");
4684         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4685       }
4686       break;
4687     case T_SHORT:
4688       if (ideal_opc == Op_SaturatingAddV) {
4689         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4690       } else {
4691         assert(ideal_opc == Op_SaturatingSubV, "");
4692         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4693       }
4694       break;
4695     default:
4696       fatal("Unsupported type %s", type2name(elem_bt));
4697       break;
4698   }
4699 }
4700 
4701 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4702                                                Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4703   if (is_unsigned) {
4704     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4705   } else {
4706     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4707   }
4708 }
4709 
4710 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4711                                                       XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4712   switch (elem_bt) {
4713     case T_BYTE:
4714       if (ideal_opc == Op_SaturatingAddV) {
4715         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4716       } else {
4717         assert(ideal_opc == Op_SaturatingSubV, "");
4718         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4719       }
4720       break;
4721     case T_SHORT:
4722       if (ideal_opc == Op_SaturatingAddV) {
4723         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4724       } else {
4725         assert(ideal_opc == Op_SaturatingSubV, "");
4726         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4727       }
4728       break;
4729     default:
4730       fatal("Unsupported type %s", type2name(elem_bt));
4731       break;
4732   }
4733 }
4734 
4735 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4736                                                         XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4737   switch (elem_bt) {
4738     case T_BYTE:
4739       if (ideal_opc == Op_SaturatingAddV) {
4740         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4741       } else {
4742         assert(ideal_opc == Op_SaturatingSubV, "");
4743         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4744       }
4745       break;
4746     case T_SHORT:
4747       if (ideal_opc == Op_SaturatingAddV) {
4748         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4749       } else {
4750         assert(ideal_opc == Op_SaturatingSubV, "");
4751         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4752       }
4753       break;
4754     default:
4755       fatal("Unsupported type %s", type2name(elem_bt));
4756       break;
4757   }
4758 }
4759 
4760 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4761                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4762                                     bool is_varshift) {
4763   switch (ideal_opc) {
4764     case Op_AddVB:
4765       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4766     case Op_AddVS:
4767       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4768     case Op_AddVI:
4769       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4770     case Op_AddVL:
4771       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4772     case Op_AddVF:
4773       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4774     case Op_AddVD:
4775       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4776     case Op_SubVB:
4777       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4778     case Op_SubVS:
4779       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4780     case Op_SubVI:
4781       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4782     case Op_SubVL:
4783       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4784     case Op_SubVF:
4785       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4786     case Op_SubVD:
4787       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4788     case Op_MulVS:
4789       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4790     case Op_MulVI:
4791       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4792     case Op_MulVL:
4793       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4794     case Op_MulVF:
4795       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4796     case Op_MulVD:
4797       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4798     case Op_DivVF:
4799       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4800     case Op_DivVD:
4801       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4802     case Op_SqrtVF:
4803       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4804     case Op_SqrtVD:
4805       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4806     case Op_AbsVB:
4807       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4808     case Op_AbsVS:
4809       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4810     case Op_AbsVI:
4811       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4812     case Op_AbsVL:
4813       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4814     case Op_FmaVF:
4815       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4816     case Op_FmaVD:
4817       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4818     case Op_VectorRearrange:
4819       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4820     case Op_LShiftVS:
4821       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4822     case Op_LShiftVI:
4823       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4824     case Op_LShiftVL:
4825       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4826     case Op_RShiftVS:
4827       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4828     case Op_RShiftVI:
4829       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4830     case Op_RShiftVL:
4831       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4832     case Op_URShiftVS:
4833       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4834     case Op_URShiftVI:
4835       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4836     case Op_URShiftVL:
4837       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4838     case Op_RotateLeftV:
4839       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4840     case Op_RotateRightV:
4841       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4842     case Op_MaxV:
4843       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4844     case Op_MinV:
4845       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4846     case Op_UMinV:
4847       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4848     case Op_UMaxV:
4849       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4850     case Op_XorV:
4851       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4852     case Op_OrV:
4853       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4854     case Op_AndV:
4855       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4856     default:
4857       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4858       break;
4859   }
4860 }
4861 
4862 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4863                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4864   switch (ideal_opc) {
4865     case Op_AddVB:
4866       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4867     case Op_AddVS:
4868       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4869     case Op_AddVI:
4870       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4871     case Op_AddVL:
4872       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4873     case Op_AddVF:
4874       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4875     case Op_AddVD:
4876       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4877     case Op_SubVB:
4878       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4879     case Op_SubVS:
4880       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4881     case Op_SubVI:
4882       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4883     case Op_SubVL:
4884       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4885     case Op_SubVF:
4886       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4887     case Op_SubVD:
4888       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4889     case Op_MulVS:
4890       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4891     case Op_MulVI:
4892       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4893     case Op_MulVL:
4894       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4895     case Op_MulVF:
4896       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4897     case Op_MulVD:
4898       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4899     case Op_DivVF:
4900       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4901     case Op_DivVD:
4902       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4903     case Op_FmaVF:
4904       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4905     case Op_FmaVD:
4906       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4907     case Op_MaxV:
4908       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4909     case Op_MinV:
4910       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4911     case Op_UMaxV:
4912       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4913     case Op_UMinV:
4914       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4915     case Op_XorV:
4916       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4917     case Op_OrV:
4918       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4919     case Op_AndV:
4920       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4921     default:
4922       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4923       break;
4924   }
4925 }
4926 
4927 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4928                                   KRegister src1, KRegister src2) {
4929   BasicType etype = T_ILLEGAL;
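  // The k-register instruction width is chosen from the mask length: masks of
  // up to 8 lanes use the byte form (kandb/korb/kxorb), 16 lanes the word form,
  // 32 the doubleword form and 64 the quadword form.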
4930   switch(mask_len) {
4931     case 2:
4932     case 4:
4933     case 8:  etype = T_BYTE; break;
4934     case 16: etype = T_SHORT; break;
4935     case 32: etype = T_INT; break;
4936     case 64: etype = T_LONG; break;
4937     default: fatal("Unsupported type"); break;
4938   }
4939   assert(etype != T_ILLEGAL, "");
4940   switch(ideal_opc) {
4941     case Op_AndVMask:
4942       kand(etype, dst, src1, src2); break;
4943     case Op_OrVMask:
4944       kor(etype, dst, src1, src2); break;
4945     case Op_XorVMask:
4946       kxor(etype, dst, src1, src2); break;
4947     default:
4948       fatal("Unsupported masked operation"); break;
4949   }
4950 }
4951 
4952 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
4954  * If src is NaN, the result is 0.
4955  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4956  * the result is equal to the value of Integer.MIN_VALUE.
4957  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4958  * the result is equal to the value of Integer.MAX_VALUE.
4959  */
4960 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4961                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4962                                                                    Register rscratch, AddressLiteral float_sign_flip,
4963                                                                    int vec_enc) {
4964   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4965   Label done;
4966   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4967   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4968   vptest(xtmp2, xtmp2, vec_enc);
4969   jccb(Assembler::equal, done);
4970 
4971   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4972   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4973 
4974   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4975   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4976   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4977 
4978   // Recompute the mask for remaining special value.
4979   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4980   // Extract SRC values corresponding to TRUE mask lanes.
4981   vpand(xtmp4, xtmp2, src, vec_enc);
4982   // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
4983   // values are set.
4984   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4985 
4986   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4987   bind(done);
4988 }
4989 
4990 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4991                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4992                                                                     Register rscratch, AddressLiteral float_sign_flip,
4993                                                                     int vec_enc) {
4994   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4995   Label done;
4996   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4997   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4998   kortestwl(ktmp1, ktmp1);
4999   jccb(Assembler::equal, done);
5000 
5001   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5002   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5003   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5004 
5005   kxorwl(ktmp1, ktmp1, ktmp2);
5006   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
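  // vpternlogd with truth table 0x11 computes ~src2 & ~src3; with both operands
  // equal to xtmp1 this sets xtmp2 = ~sign_flip, i.e. Integer.MAX_VALUE in every lane.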
5007   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5008   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5009   bind(done);
5010 }
5011 
5012 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5013                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5014                                                                      Register rscratch, AddressLiteral double_sign_flip,
5015                                                                      int vec_enc) {
5016   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5017 
5018   Label done;
5019   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5020   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
5021   kortestwl(ktmp1, ktmp1);
5022   jccb(Assembler::equal, done);
5023 
5024   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5025   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5026   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5027 
5028   kxorwl(ktmp1, ktmp1, ktmp2);
5029   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5030   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5031   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5032   bind(done);
5033 }
5034 
5035 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5036                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5037                                                                      Register rscratch, AddressLiteral float_sign_flip,
5038                                                                      int vec_enc) {
5039   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5040   Label done;
5041   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5042   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5043   kortestwl(ktmp1, ktmp1);
5044   jccb(Assembler::equal, done);
5045 
5046   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5047   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5048   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5049 
5050   kxorwl(ktmp1, ktmp1, ktmp2);
5051   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5052   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5053   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5054   bind(done);
5055 }
5056 
5057 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
5059  * If src is NaN, the result is 0.
5060  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5061  * the result is equal to the value of Long.MIN_VALUE.
5062  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5063  * the result is equal to the value of Long.MAX_VALUE.
5064  */
5065 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5066                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5067                                                                       Register rscratch, AddressLiteral double_sign_flip,
5068                                                                       int vec_enc) {
5069   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5070 
5071   Label done;
5072   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5073   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
5074   kortestwl(ktmp1, ktmp1);
5075   jccb(Assembler::equal, done);
5076 
5077   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5078   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5079   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5080 
5081   kxorwl(ktmp1, ktmp1, ktmp2);
5082   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5083   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5084   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5085   bind(done);
5086 }
5087 
5088 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5089                                                              XMMRegister xtmp, int index, int vec_enc) {
5090    assert(vec_enc < Assembler::AVX_512bit, "");
5091    if (vec_enc == Assembler::AVX_256bit) {
5092      vextractf128_high(xtmp, src);
5093      vshufps(dst, src, xtmp, index, vec_enc);
5094    } else {
5095      vshufps(dst, src, zero, index, vec_enc);
5096    }
5097 }
5098 
5099 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5100                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5101                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
5102   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5103 
5104   Label done;
5105   // Compare the destination lanes with float_sign_flip
5106   // value to get mask for all special values.
5107   movdqu(xtmp1, float_sign_flip, rscratch);
5108   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5109   ptest(xtmp2, xtmp2);
5110   jccb(Assembler::equal, done);
5111 
5112   // Flip float_sign_flip to get max integer value.
5113   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5114   pxor(xtmp1, xtmp4);
5115 
  // Set destination lanes corresponding to unordered (NaN) source lanes to zero.
5117   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5118   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5119 
  // Shuffle the mask vector and pack the lower double word from each quadword lane.
5121   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5122   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5123 
5124   // Recompute the mask for remaining special value.
5125   pxor(xtmp2, xtmp3);
5126   // Extract mask corresponding to non-negative source lanes.
5127   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5128 
  // Shuffle the mask vector and pack the lower double word from each quadword lane.
5130   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5131   pand(xtmp3, xtmp2);
5132 
5133   // Replace destination lanes holding special value(0x80000000) with max int
5134   // if corresponding source lane holds a +ve value.
5135   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5136   bind(done);
5137 }
5138 
5139 
5140 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5141                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5142   switch(to_elem_bt) {
5143     case T_SHORT:
5144       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5145       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5146       vpackusdw(dst, dst, zero, vec_enc);
5147       if (vec_enc == Assembler::AVX_256bit) {
5148         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5149       }
5150       break;
5151     case  T_BYTE:
5152       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5153       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5154       vpackusdw(dst, dst, zero, vec_enc);
5155       if (vec_enc == Assembler::AVX_256bit) {
5156         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5157       }
5158       vpackuswb(dst, dst, zero, vec_enc);
5159       break;
5160     default: assert(false, "%s", type2name(to_elem_bt));
5161   }
5162 }
5163 
5164 /*
 * Algorithm for vector D2L and F2I conversions:
 * a) Perform the vector D2L/F2I cast.
 * b) Take the fast path if no lane of the result vector contains the value 0x80000000,
 *    since that value signifies that the corresponding source value could be one of the
 *    special floating point values (NaN, -Inf, Inf, Max, -Min).
 * c) Set the destination to zero if the source is NaN.
 * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
5172  */
5173 
5174 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5175                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5176                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5177   int to_elem_sz = type2aelembytes(to_elem_bt);
5178   assert(to_elem_sz <= 4, "");
5179   vcvttps2dq(dst, src, vec_enc);
5180   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5181   if (to_elem_sz < 4) {
5182     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5183     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5184   }
5185 }
5186 
5187 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5188                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5189                                             Register rscratch, int vec_enc) {
5190   int to_elem_sz = type2aelembytes(to_elem_bt);
5191   assert(to_elem_sz <= 4, "");
5192   vcvttps2dq(dst, src, vec_enc);
5193   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5194   switch(to_elem_bt) {
5195     case T_INT:
5196       break;
5197     case T_SHORT:
5198       evpmovdw(dst, dst, vec_enc);
5199       break;
5200     case T_BYTE:
5201       evpmovdb(dst, dst, vec_enc);
5202       break;
5203     default: assert(false, "%s", type2name(to_elem_bt));
5204   }
5205 }
5206 
5207 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5208                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5209                                             Register rscratch, int vec_enc) {
5210   evcvttps2qq(dst, src, vec_enc);
5211   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5212 }
5213 
5214 // Handling for downcasting from double to integer or sub-word types on AVX2.
5215 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5216                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5217                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5218   int to_elem_sz = type2aelembytes(to_elem_bt);
5219   assert(to_elem_sz < 8, "");
5220   vcvttpd2dq(dst, src, vec_enc);
5221   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5222                                               float_sign_flip, vec_enc);
5223   if (to_elem_sz < 4) {
5224     // xtmp4 holds all zero lanes.
5225     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5226   }
5227 }
5228 
5229 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5230                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5231                                             KRegister ktmp2, AddressLiteral sign_flip,
5232                                             Register rscratch, int vec_enc) {
5233   if (VM_Version::supports_avx512dq()) {
5234     evcvttpd2qq(dst, src, vec_enc);
5235     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5236     switch(to_elem_bt) {
5237       case T_LONG:
5238         break;
5239       case T_INT:
5240         evpmovsqd(dst, dst, vec_enc);
5241         break;
5242       case T_SHORT:
5243         evpmovsqd(dst, dst, vec_enc);
5244         evpmovdw(dst, dst, vec_enc);
5245         break;
5246       case T_BYTE:
5247         evpmovsqd(dst, dst, vec_enc);
5248         evpmovdb(dst, dst, vec_enc);
5249         break;
5250       default: assert(false, "%s", type2name(to_elem_bt));
5251     }
5252   } else {
5253     assert(type2aelembytes(to_elem_bt) <= 4, "");
5254     vcvttpd2dq(dst, src, vec_enc);
5255     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5256     switch(to_elem_bt) {
5257       case T_INT:
5258         break;
5259       case T_SHORT:
5260         evpmovdw(dst, dst, vec_enc);
5261         break;
5262       case T_BYTE:
5263         evpmovdb(dst, dst, vec_enc);
5264         break;
5265       default: assert(false, "%s", type2name(to_elem_bt));
5266     }
5267   }
5268 }
5269 
5270 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5271                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5272                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
  // then restore the original MXCSR.RC mode afterwards.
5275   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5276 
5277   mov64(tmp, julong_cast(0.5L));
5278   evpbroadcastq(xtmp1, tmp, vec_enc);
5279   vaddpd(xtmp1, src , xtmp1, vec_enc);
5280   evcvtpd2qq(dst, xtmp1, vec_enc);
5281   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                                double_sign_flip, vec_enc);
5283 
5284   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5285 }
5286 
5287 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5288                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5289                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
  // then restore the original MXCSR.RC mode afterwards.
5292   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5293 
5294   movl(tmp, jint_cast(0.5));
5295   movq(xtmp1, tmp);
5296   vbroadcastss(xtmp1, xtmp1, vec_enc);
5297   vaddps(xtmp1, src , xtmp1, vec_enc);
5298   vcvtps2dq(dst, xtmp1, vec_enc);
5299   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5300                                               float_sign_flip, vec_enc);
5301 
5302   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5303 }
5304 
5305 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5306                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5307                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
  // then restore the original MXCSR.RC mode afterwards.
5310   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5311 
5312   movl(tmp, jint_cast(0.5));
5313   movq(xtmp1, tmp);
5314   vbroadcastss(xtmp1, xtmp1, vec_enc);
5315   vaddps(xtmp1, src , xtmp1, vec_enc);
5316   vcvtps2dq(dst, xtmp1, vec_enc);
5317   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5318 
5319   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5320 }
5321 
5322 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5323                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5324   switch (from_elem_bt) {
5325     case T_BYTE:
5326       switch (to_elem_bt) {
5327         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5328         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5329         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5330         default: ShouldNotReachHere();
5331       }
5332       break;
5333     case T_SHORT:
5334       switch (to_elem_bt) {
5335         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5336         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5337         default: ShouldNotReachHere();
5338       }
5339       break;
5340     case T_INT:
5341       assert(to_elem_bt == T_LONG, "");
5342       vpmovzxdq(dst, src, vlen_enc);
5343       break;
5344     default:
5345       ShouldNotReachHere();
5346   }
5347 }
5348 
5349 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5350                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5351   switch (from_elem_bt) {
5352     case T_BYTE:
5353       switch (to_elem_bt) {
5354         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5355         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5356         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5357         default: ShouldNotReachHere();
5358       }
5359       break;
5360     case T_SHORT:
5361       switch (to_elem_bt) {
5362         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5363         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5364         default: ShouldNotReachHere();
5365       }
5366       break;
5367     case T_INT:
5368       assert(to_elem_bt == T_LONG, "");
5369       vpmovsxdq(dst, src, vlen_enc);
5370       break;
5371     default:
5372       ShouldNotReachHere();
5373   }
5374 }
5375 
5376 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5377                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5378   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5379   assert(vlen_enc != AVX_512bit, "");
5380 
5381   int dst_bt_size = type2aelembytes(dst_bt);
5382   int src_bt_size = type2aelembytes(src_bt);
5383   if (dst_bt_size > src_bt_size) {
5384     switch (dst_bt_size / src_bt_size) {
5385       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5386       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5387       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5388       default: ShouldNotReachHere();
5389     }
5390   } else {
5391     assert(dst_bt_size < src_bt_size, "");
5392     switch (src_bt_size / dst_bt_size) {
5393       case 2: {
5394         if (vlen_enc == AVX_128bit) {
5395           vpacksswb(dst, src, src, vlen_enc);
5396         } else {
5397           vpacksswb(dst, src, src, vlen_enc);
5398           vpermq(dst, dst, 0x08, vlen_enc);
5399         }
5400         break;
5401       }
5402       case 4: {
5403         if (vlen_enc == AVX_128bit) {
5404           vpackssdw(dst, src, src, vlen_enc);
5405           vpacksswb(dst, dst, dst, vlen_enc);
5406         } else {
5407           vpackssdw(dst, src, src, vlen_enc);
5408           vpermq(dst, dst, 0x08, vlen_enc);
5409           vpacksswb(dst, dst, dst, AVX_128bit);
5410         }
5411         break;
5412       }
5413       case 8: {
5414         if (vlen_enc == AVX_128bit) {
5415           vpshufd(dst, src, 0x08, vlen_enc);
5416           vpackssdw(dst, dst, dst, vlen_enc);
5417           vpacksswb(dst, dst, dst, vlen_enc);
5418         } else {
5419           vpshufd(dst, src, 0x08, vlen_enc);
5420           vpermq(dst, dst, 0x08, vlen_enc);
5421           vpackssdw(dst, dst, dst, AVX_128bit);
5422           vpacksswb(dst, dst, dst, AVX_128bit);
5423         }
5424         break;
5425       }
5426       default: ShouldNotReachHere();
5427     }
5428   }
5429 }
5430 
5431 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5432                                    bool merge, BasicType bt, int vlen_enc) {
5433   if (bt == T_INT) {
5434     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5435   } else {
5436     assert(bt == T_LONG, "");
5437     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5438   }
5439 }
5440 
5441 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5442                                    bool merge, BasicType bt, int vlen_enc) {
5443   if (bt == T_INT) {
5444     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5445   } else {
5446     assert(bt == T_LONG, "");
5447     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5448   }
5449 }
5450 
5451 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5452                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5453                                                int vec_enc) {
5454   int index = 0;
5455   int vindex = 0;
5456   mov64(rtmp1, 0x0101010101010101L);
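  // PDEP deposits the low 8 bits of src into bit 0 of each byte of rtmp1,
  // expanding 8 mask bits into 8 byte lanes holding 0 or 1.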
5457   pdepq(rtmp1, src, rtmp1);
5458   if (mask_len > 8) {
5459     movq(rtmp2, src);
5460     vpxor(xtmp, xtmp, xtmp, vec_enc);
5461     movq(xtmp, rtmp1);
5462   }
5463   movq(dst, rtmp1);
5464 
5465   mask_len -= 8;
5466   while (mask_len > 0) {
    assert((mask_len & 0x7) == 0, "mask_len must be a multiple of 8");
5468     index++;
5469     if ((index % 2) == 0) {
5470       pxor(xtmp, xtmp);
5471     }
5472     mov64(rtmp1, 0x0101010101010101L);
5473     shrq(rtmp2, 8);
5474     pdepq(rtmp1, rtmp2, rtmp1);
5475     pinsrq(xtmp, rtmp1, index % 2);
5476     vindex = index / 2;
5477     if (vindex) {
      // Write the entire 16 byte vector once both 64 bit
      // lanes are updated, to save redundant instructions.
5480       if (index % 2) {
5481         vinsertf128(dst, dst, xtmp, vindex);
5482       }
5483     } else {
5484       vmovdqu(dst, xtmp);
5485     }
5486     mask_len -= 8;
5487   }
5488 }
5489 
5490 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5491   switch(opc) {
5492     case Op_VectorMaskTrueCount:
5493       popcntq(dst, tmp);
5494       break;
5495     case Op_VectorMaskLastTrue:
5496       if (VM_Version::supports_lzcnt()) {
5497         lzcntq(tmp, tmp);
5498         movl(dst, 63);
5499         subl(dst, tmp);
5500       } else {
5501         movl(dst, -1);
5502         bsrq(tmp, tmp);
5503         cmov32(Assembler::notZero, dst, tmp);
5504       }
5505       break;
5506     case Op_VectorMaskFirstTrue:
5507       if (VM_Version::supports_bmi1()) {
5508         if (masklen < 32) {
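          // Set a sentinel bit just above the mask so that tzcnt returns masklen
          // (instead of the register width) when no mask bit is set.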
5509           orl(tmp, 1 << masklen);
5510           tzcntl(dst, tmp);
5511         } else if (masklen == 32) {
5512           tzcntl(dst, tmp);
5513         } else {
5514           assert(masklen == 64, "");
5515           tzcntq(dst, tmp);
5516         }
5517       } else {
5518         if (masklen < 32) {
5519           orl(tmp, 1 << masklen);
5520           bsfl(dst, tmp);
5521         } else {
5522           assert(masklen == 32 || masklen == 64, "");
5523           movl(dst, masklen);
5524           if (masklen == 32)  {
5525             bsfl(tmp, tmp);
5526           } else {
5527             bsfq(tmp, tmp);
5528           }
5529           cmov32(Assembler::notZero, dst, tmp);
5530         }
5531       }
5532       break;
5533     case Op_VectorMaskToLong:
5534       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5535       break;
5536     default: assert(false, "Unhandled mask operation");
5537   }
5538 }
5539 
5540 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5541                                               int masklen, int masksize, int vec_enc) {
5542   assert(VM_Version::supports_popcnt(), "");
5543 
  if (VM_Version::supports_avx512bw()) {
5545     kmovql(tmp, mask);
5546   } else {
5547     assert(masklen <= 16, "");
5548     kmovwl(tmp, mask);
5549   }
5550 
  // Masks generated by partial vector comparison/replicate/mask manipulation
  // operations need to be clipped.
5553   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5554     andq(tmp, (1 << masklen) - 1);
5555   }
5556 
5557   vector_mask_operation_helper(opc, dst, tmp, masklen);
5558 }
5559 
5560 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5561                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5562   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5563          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5564   assert(VM_Version::supports_popcnt(), "");
5565 
5566   bool need_clip = false;
5567   switch(bt) {
5568     case T_BOOLEAN:
      // Masks of other types contain lane values 0 and -1; boolean masks contain lane values 0 and 1.
5570       vpxor(xtmp, xtmp, xtmp, vec_enc);
5571       vpsubb(xtmp, xtmp, mask, vec_enc);
5572       vpmovmskb(tmp, xtmp, vec_enc);
5573       need_clip = masklen < 16;
5574       break;
5575     case T_BYTE:
5576       vpmovmskb(tmp, mask, vec_enc);
5577       need_clip = masklen < 16;
5578       break;
5579     case T_SHORT:
5580       vpacksswb(xtmp, mask, mask, vec_enc);
5581       if (masklen >= 16) {
5582         vpermpd(xtmp, xtmp, 8, vec_enc);
5583       }
5584       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5585       need_clip = masklen < 16;
5586       break;
5587     case T_INT:
5588     case T_FLOAT:
5589       vmovmskps(tmp, mask, vec_enc);
5590       need_clip = masklen < 4;
5591       break;
5592     case T_LONG:
5593     case T_DOUBLE:
5594       vmovmskpd(tmp, mask, vec_enc);
5595       need_clip = masklen < 2;
5596       break;
5597     default: assert(false, "Unhandled type, %s", type2name(bt));
5598   }
5599 
  // Masks generated by partial vector comparison/replicate/mask manipulation
  // operations need to be clipped.
5602   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5603     // need_clip implies masklen < 32
5604     andq(tmp, (1 << masklen) - 1);
5605   }
5606 
5607   vector_mask_operation_helper(opc, dst, tmp, masklen);
5608 }
5609 
5610 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5611                                              Register rtmp2, int mask_len) {
5612   kmov(rtmp1, src);
5613   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5614   mov64(rtmp2, -1L);
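  // PEXT gathers the bits of -1 selected by rtmp1 into the low bits of rtmp2,
  // producing a value with popcount(rtmp1) contiguous low bits set.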
5615   pextq(rtmp2, rtmp2, rtmp1);
5616   kmov(dst, rtmp2);
5617 }
5618 
5619 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5620                                                     XMMRegister mask, Register rtmp, Register rscratch,
5621                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5622                                                     int vec_enc) {
5623   assert(type2aelembytes(bt) >= 4, "");
5624   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5625   address compress_perm_table = nullptr;
5626   address expand_perm_table = nullptr;
5627   if (type2aelembytes(bt) == 8) {
5628     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5629     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5630     vmovmskpd(rtmp, mask, vec_enc);
5631   } else {
5632     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5633     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5634     vmovmskps(rtmp, mask, vec_enc);
5635   }
5636   shlq(rtmp, 5); // for 32 byte permute row.
5637   if (opcode == Op_CompressV) {
5638     lea(rscratch, ExternalAddress(compress_perm_table));
5639   } else {
5640     lea(rscratch, ExternalAddress(expand_perm_table));
5641   }
5642   addptr(rtmp, rscratch);
5643   vmovdqu(permv, Address(rtmp));
5644   vpermps(dst, permv, src, Assembler::AVX_256bit);
5645   vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with a zero vector using the permute mask. Each column entry
  // in a permute table row contains either a valid permute index or -1 (the default)
  // value, so the row can also be used as a blending mask after
  // compressing/expanding the source vector lanes.
5650   vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
5651 }
5652 
5653 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5654                                                bool merge, BasicType bt, int vec_enc) {
5655   if (opcode == Op_CompressV) {
5656     switch(bt) {
5657     case T_BYTE:
5658       evpcompressb(dst, mask, src, merge, vec_enc);
5659       break;
5660     case T_CHAR:
5661     case T_SHORT:
5662       evpcompressw(dst, mask, src, merge, vec_enc);
5663       break;
5664     case T_INT:
5665       evpcompressd(dst, mask, src, merge, vec_enc);
5666       break;
5667     case T_FLOAT:
5668       evcompressps(dst, mask, src, merge, vec_enc);
5669       break;
5670     case T_LONG:
5671       evpcompressq(dst, mask, src, merge, vec_enc);
5672       break;
5673     case T_DOUBLE:
5674       evcompresspd(dst, mask, src, merge, vec_enc);
5675       break;
5676     default:
5677       fatal("Unsupported type %s", type2name(bt));
5678       break;
5679     }
5680   } else {
5681     assert(opcode == Op_ExpandV, "");
5682     switch(bt) {
5683     case T_BYTE:
5684       evpexpandb(dst, mask, src, merge, vec_enc);
5685       break;
5686     case T_CHAR:
5687     case T_SHORT:
5688       evpexpandw(dst, mask, src, merge, vec_enc);
5689       break;
5690     case T_INT:
5691       evpexpandd(dst, mask, src, merge, vec_enc);
5692       break;
5693     case T_FLOAT:
5694       evexpandps(dst, mask, src, merge, vec_enc);
5695       break;
5696     case T_LONG:
5697       evpexpandq(dst, mask, src, merge, vec_enc);
5698       break;
5699     case T_DOUBLE:
5700       evexpandpd(dst, mask, src, merge, vec_enc);
5701       break;
5702     default:
5703       fatal("Unsupported type %s", type2name(bt));
5704       break;
5705     }
5706   }
5707 }
5708 
5709 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5710                                            KRegister ktmp1, int vec_enc) {
5711   if (opcode == Op_SignumVD) {
5712     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5714     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5715     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src.
5717     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5718     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5719   } else {
5720     assert(opcode == Op_SignumVF, "");
5721     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5723     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5724     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src.
5726     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5727     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5728   }
5729 }
5730 
5731 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5732                                           XMMRegister xtmp1, int vec_enc) {
5733   if (opcode == Op_SignumVD) {
5734     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5736     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src is NaN, -0.0 or 0.0, return src.
5738     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5739     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5740   } else {
5741     assert(opcode == Op_SignumVF, "");
5742     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5744     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src is NaN, -0.0 or 0.0, return src.
5746     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5747     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5748   }
5749 }
5750 
5751 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5752   if (VM_Version::supports_avx512bw()) {
5753     if (mask_len > 32) {
5754       kmovql(dst, src);
5755     } else {
5756       kmovdl(dst, src);
5757       if (mask_len != 32) {
5758         kshiftrdl(dst, dst, 32 - mask_len);
5759       }
5760     }
5761   } else {
5762     assert(mask_len <= 16, "");
5763     kmovwl(dst, src);
5764     if (mask_len != 16) {
5765       kshiftrwl(dst, dst, 16 - mask_len);
5766     }
5767   }
5768 }
5769 
5770 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5771   int lane_size = type2aelembytes(bt);
5772   if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5773       (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5774     movptr(rtmp, imm32);
5775     switch(lane_size) {
5776       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5777       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5778       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5779       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
5782     }
5783   } else {
5784     movptr(rtmp, imm32);
5785     movq(dst, rtmp);
5786     switch(lane_size) {
5787       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5788       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5789       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5790       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
5793     }
5794   }
5795 }
5796 
5797 //
// The following is the lookup table based popcount computation algorithm:
//       Index   Bit set count
//     [ 0000 ->   0,
//       0001 ->   1,
//       0010 ->   1,
//       0011 ->   2,
//       0100 ->   1,
//       0101 ->   2,
//       0110 ->   2,
//       0111 ->   3,
//       1000 ->   1,
//       1001 ->   2,
//       1010 ->   2,
//       1011 ->   3,
//       1100 ->   2,
//       1101 ->   3,
//       1110 ->   3,
//       1111 ->   4 ]
//  a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as
//     shuffle indices for lookup table accesses.
//  b. Right shift each byte of the vector lane by 4 positions.
//  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
//     shuffle indices for lookup table accesses.
//  d. Add the bitset counts of the upper and lower 4 bits of each byte.
//  e. Unpack double words to quad words and compute the sum of absolute differences of the
//     bitset counts of all the bytes of a quadword.
//  f. Perform step e. for the upper 128 bit vector lane.
//  g. Pack the bitset counts of quadwords back to double words.
//  h. Unpacking and packing operations are not needed for 64 bit vector lanes.
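//
// A minimal scalar sketch of steps a-d, for reference only (illustrative C code, not
// emitted by the assembler; lut and popcount_byte are hypothetical names):
//
//   static const uint8_t lut[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
//   static uint8_t popcount_byte(uint8_t b) {
//     return lut[b & 0x0F] + lut[b >> 4];
//   }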
5826 
5827 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5828                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5829   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5830   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5831   vpsrlw(dst, src, 4, vec_enc);
5832   vpand(dst, dst, xtmp1, vec_enc);
5833   vpand(xtmp1, src, xtmp1, vec_enc);
5834   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5835   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5836   vpshufb(dst, xtmp2, dst, vec_enc);
5837   vpaddb(dst, dst, xtmp1, vec_enc);
5838 }
5839 
5840 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5841                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5842   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // The following code is as per steps e, f, g and h of the above algorithm.
5844   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5845   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5846   vpsadbw(dst, dst, xtmp2, vec_enc);
5847   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5848   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5849   vpackuswb(dst, xtmp1, dst, vec_enc);
5850 }
5851 
5852 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5853                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5854   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5855   // Add the popcount of upper and lower bytes of word.
5856   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5857   vpsrlw(dst, xtmp1, 8, vec_enc);
5858   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5859   vpaddw(dst, dst, xtmp1, vec_enc);
5860 }
5861 
5862 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5863                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5864   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5865   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5866   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5867 }
5868 
5869 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5870                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5871   switch(bt) {
5872     case T_LONG:
5873       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5874       break;
5875     case T_INT:
5876       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5877       break;
5878     case T_CHAR:
5879     case T_SHORT:
5880       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5881       break;
5882     case T_BYTE:
5883     case T_BOOLEAN:
5884       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5885       break;
5886     default:
5887       fatal("Unsupported type %s", type2name(bt));
5888       break;
5889   }
5890 }
5891 
5892 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5893                                                       KRegister mask, bool merge, int vec_enc) {
5894   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5895   switch(bt) {
5896     case T_LONG:
5897       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5898       evpopcntq(dst, mask, src, merge, vec_enc);
5899       break;
5900     case T_INT:
5901       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5902       evpopcntd(dst, mask, src, merge, vec_enc);
5903       break;
5904     case T_CHAR:
5905     case T_SHORT:
5906       assert(VM_Version::supports_avx512_bitalg(), "");
5907       evpopcntw(dst, mask, src, merge, vec_enc);
5908       break;
5909     case T_BYTE:
5910     case T_BOOLEAN:
5911       assert(VM_Version::supports_avx512_bitalg(), "");
5912       evpopcntb(dst, mask, src, merge, vec_enc);
5913       break;
5914     default:
5915       fatal("Unsupported type %s", type2name(bt));
5916       break;
5917   }
5918 }
5919 
// The bit reversal algorithm first reverses the bits of each byte, followed by
// a byte level reversal for multi-byte primitive types (short/int/long).
// The algorithm performs a lookup table access to get the reverse bit sequence
// corresponding to a 4 bit value. Thus the reverse bit sequence for a byte
// is obtained by swapping the reverse bit sequences of the upper and lower
// nibbles of the byte.
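// For example, for the byte 0b10110001 the lookup table maps the lower nibble
// 0b0001 to 0b1000 and the upper nibble 0b1011 to 0b1101; swapping the two
// reversed nibbles yields 0b10001101, the bit reversal of the original byte.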
5926 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5927                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5928   if (VM_Version::supports_avx512vlbw()) {
5929 
5930     // Get the reverse bit sequence of lower nibble of each byte.
5931     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5932     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5933     evpandq(dst, xtmp2, src, vec_enc);
5934     vpshufb(dst, xtmp1, dst, vec_enc);
5935     vpsllq(dst, dst, 4, vec_enc);
5936 
5937     // Get the reverse bit sequence of upper nibble of each byte.
5938     vpandn(xtmp2, xtmp2, src, vec_enc);
5939     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5940     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5941 
5942     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5943     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5944     evporq(xtmp2, dst, xtmp2, vec_enc);
5945     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5946 
  } else if (vec_enc == Assembler::AVX_512bit) {
5948     // Shift based bit reversal.
5949     assert(bt == T_LONG || bt == T_INT, "");
5950 
5951     // Swap lower and upper nibble of each byte.
5952     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5953 
5954     // Swap two least and most significant bits of each nibble.
5955     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5956 
5957     // Swap adjacent pair of bits.
5958     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5959     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5960 
5961     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5962     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5963   } else {
5964     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5965     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5966 
5967     // Get the reverse bit sequence of lower nibble of each byte.
5968     vpand(dst, xtmp2, src, vec_enc);
5969     vpshufb(dst, xtmp1, dst, vec_enc);
5970     vpsllq(dst, dst, 4, vec_enc);
5971 
5972     // Get the reverse bit sequence of upper nibble of each byte.
5973     vpandn(xtmp2, xtmp2, src, vec_enc);
5974     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5975     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5976 
5977     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5978     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5979     vpor(xtmp2, dst, xtmp2, vec_enc);
5980     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5981   }
5982 }
5983 
5984 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5985                                                 XMMRegister xtmp, Register rscratch) {
5986   assert(VM_Version::supports_gfni(), "");
5987   assert(rscratch != noreg || always_reachable(mask), "missing");
5988 
5989   // Galois field instruction based bit reversal based on following algorithm.
5990   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5991   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5992   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5993   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5994 }
5995 
5996 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5997                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
5998   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5999   evpandq(dst, xtmp1, src, vec_enc);
6000   vpsllq(dst, dst, nbits, vec_enc);
6001   vpandn(xtmp1, xtmp1, src, vec_enc);
6002   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
6003   evporq(dst, dst, xtmp1, vec_enc);
6004 }
6005 
6006 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6007                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
6008   // Shift based bit reversal.
6009   assert(VM_Version::supports_evex(), "");
6010   switch(bt) {
6011     case T_LONG:
6012       // Swap upper and lower double word of each quad word.
6013       evprorq(xtmp1, k0, src, 32, true, vec_enc);
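      // Then swap the upper and lower word of each double word.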
6014       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
6015       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6016       break;
6017     case T_INT:
6018       // Swap upper and lower word of each double word.
6019       evprord(xtmp1, k0, src, 16, true, vec_enc);
6020       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6021       break;
6022     case T_CHAR:
6023     case T_SHORT:
6024       // Swap upper and lower byte of each word.
6025       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
6026       break;
6027     case T_BYTE:
6028       evmovdquq(dst, k0, src, true, vec_enc);
6029       break;
6030     default:
6031       fatal("Unsupported type %s", type2name(bt));
6032       break;
6033   }
6034 }
6035 
6036 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
6037   if (bt == T_BYTE) {
6038     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
6039       evmovdquq(dst, k0, src, true, vec_enc);
6040     } else {
6041       vmovdqu(dst, src);
6042     }
6043     return;
6044   }
6045   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
6046   // pre-computed shuffle indices.
6047   switch(bt) {
6048     case T_LONG:
6049       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
6050       break;
6051     case T_INT:
6052       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
6053       break;
6054     case T_CHAR:
6055     case T_SHORT:
6056       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6057       break;
6058     default:
6059       fatal("Unsupported type %s", type2name(bt));
6060       break;
6061   }
6062   vpshufb(dst, src, dst, vec_enc);
6063 }
6064 
6065 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6066                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6067                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6068   assert(is_integral_type(bt), "");
6069   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6070   assert(VM_Version::supports_avx512cd(), "");
6071   switch(bt) {
6072     case T_LONG:
6073       evplzcntq(dst, ktmp, src, merge, vec_enc);
6074       break;
6075     case T_INT:
6076       evplzcntd(dst, ktmp, src, merge, vec_enc);
6077       break;
6078     case T_SHORT:
6079       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6080       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6081       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6082       vpunpckhwd(dst, xtmp1, src, vec_enc);
6083       evplzcntd(dst, ktmp, dst, merge, vec_enc);
6084       vpackusdw(dst, xtmp2, dst, vec_enc);
6085       break;
6086     case T_BYTE:
6087       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6088       // accessing the lookup table.
6089       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6090       // accessing the lookup table.
6091       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
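      // For example, for the byte 0b00000101 the MSB nibble lookup yields 4 and the
      // LSB nibble lookup yields 1, giving a leading zero count of 5 for the byte.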
6092       assert(VM_Version::supports_avx512bw(), "");
6093       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6094       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6095       vpand(xtmp2, dst, src, vec_enc);
6096       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6097       vpsrlw(xtmp3, src, 4, vec_enc);
6098       vpand(xtmp3, dst, xtmp3, vec_enc);
6099       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6100       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6101       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6102       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6103       break;
6104     default:
6105       fatal("Unsupported type %s", type2name(bt));
6106       break;
6107   }
6108 }
6109 
6110 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6111                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6112   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6113   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6114   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6115   // accessing the lookup table.
6116   vpand(dst, xtmp2, src, vec_enc);
6117   vpshufb(dst, xtmp1, dst, vec_enc);
6118   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6119   // accessing the lookup table.
6120   vpsrlw(xtmp3, src, 4, vec_enc);
6121   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6122   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6123   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6124   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6125   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6126   vpaddb(dst, dst, xtmp2, vec_enc);
6127   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6128 }
6129 
6130 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6131                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6132   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6133   // Add zero counts of lower byte and upper byte of a word if
6134   // upper byte holds a zero value.
6135   vpsrlw(xtmp3, src, 8, vec_enc);
6136   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6137   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6138   vpsllw(xtmp2, dst, 8, vec_enc);
6139   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6140   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6141   vpsrlw(dst, dst, 8, vec_enc);
6142 }
6143 
6144 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6145                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in normalized
  // 1.x form, the biased exponent can be used to compute the leading zero count as
  // per the following formula:
  // LZCNT = 31 - (biased_exp - 127)
  // Special handling has been introduced for Zero, Max_Int and negative source values.
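  // For example, for src = 8 the conversion to float yields a biased exponent of
  // 130 (since 8 = 2^3), so LZCNT = 31 - (130 - 127) = 28, which matches lzcnt(8).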
6151 
6152   // Broadcast 0xFF
6153   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6154   vpsrld(xtmp1, xtmp1, 24, vec_enc);
6155 
6156   // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
6157   // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
6158   // contributes to the leading number of zeros.
6159   vpsrld(xtmp2, src, 1, vec_enc);
6160   vpandn(xtmp3, xtmp2, src, vec_enc);
6161 
6162   // Extract biased exponent.
6163   vcvtdq2ps(dst, xtmp3, vec_enc);
6164   vpsrld(dst, dst, 23, vec_enc);
6165   vpand(dst, dst, xtmp1, vec_enc);
6166 
6167   // Broadcast 127.
6168   vpsrld(xtmp1, xtmp1, 1, vec_enc);
6169   // Exponent = biased_exp - 127
6170   vpsubd(dst, dst, xtmp1, vec_enc);
6171 
6172   // Exponent_plus_one = Exponent + 1
6173   vpsrld(xtmp3, xtmp1, 6, vec_enc);
6174   vpaddd(dst, dst, xtmp3, vec_enc);
6175 
6176   // Replace -ve exponent with zero, exponent is -ve when src
6177   // lane contains a zero value.
6178   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6179   vblendvps(dst, dst, xtmp2, dst, vec_enc);
6180 
6181   // Rematerialize broadcast 32.
6182   vpslld(xtmp1, xtmp3, 5, vec_enc);
6183   // Exponent is 32 if corresponding source lane contains max_int value.
6184   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6185   // LZCNT = 32 - exponent_plus_one
6186   vpsubd(dst, xtmp1, dst, vec_enc);
6187 
6188   // Replace LZCNT with a value 1 if corresponding source lane
6189   // contains max_int value.
6190   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6191 
  // Replace LZCNT with 0 if the source lane value is negative, since its MSB is set
  // and the leading zero count is 0.
6193   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6194   vblendvps(dst, dst, xtmp2, src, vec_enc);
6195 }
6196 
6197 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6198                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6199   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6200   // Add zero counts of lower word and upper word of a double word if
6201   // upper word holds a zero value.
6202   vpsrld(xtmp3, src, 16, vec_enc);
6203   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6204   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6205   vpslld(xtmp2, dst, 16, vec_enc);
6206   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6207   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6208   vpsrld(dst, dst, 16, vec_enc);
6209   // Add zero counts of lower doubleword and upper doubleword of a
6210   // quadword if upper doubleword holds a zero value.
6211   vpsrlq(xtmp3, src, 32, vec_enc);
6212   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6213   vpsllq(xtmp2, dst, 32, vec_enc);
6214   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6215   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6216   vpsrlq(dst, dst, 32, vec_enc);
6217 }
6218 
6219 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6220                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6221                                                        Register rtmp, int vec_enc) {
6222   assert(is_integral_type(bt), "unexpected type");
6223   assert(vec_enc < Assembler::AVX_512bit, "");
6224   switch(bt) {
6225     case T_LONG:
6226       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6227       break;
6228     case T_INT:
6229       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6230       break;
6231     case T_SHORT:
6232       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6233       break;
6234     case T_BYTE:
6235       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6236       break;
6237     default:
6238       fatal("Unsupported type %s", type2name(bt));
6239       break;
6240   }
6241 }
6242 
6243 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6244   switch(bt) {
6245     case T_BYTE:
6246       vpsubb(dst, src1, src2, vec_enc);
6247       break;
6248     case T_SHORT:
6249       vpsubw(dst, src1, src2, vec_enc);
6250       break;
6251     case T_INT:
6252       vpsubd(dst, src1, src2, vec_enc);
6253       break;
6254     case T_LONG:
6255       vpsubq(dst, src1, src2, vec_enc);
6256       break;
6257     default:
6258       fatal("Unsupported type %s", type2name(bt));
6259       break;
6260   }
6261 }
6262 
// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
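//
// For example, for a 32 bit x = 40 (0b101000), (x - 1) & ~x = 0b111 and
// CLZ(0b111) = 29, so CTZ = 32 - 29 = 3, the number of trailing zeros of x.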
6267 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6268                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6269                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6270   assert(is_integral_type(bt), "");
6271   // xtmp = -1
6272   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6273   // xtmp = xtmp + src
6274   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6275   // xtmp = xtmp & ~src
6276   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6277   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6278   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6279   vpsub(bt, dst, xtmp4, dst, vec_enc);
6280 }
6281 
// Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation:
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
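//
// For example, for a 32 bit x = 12 (0b1100), x | -x sets all bits from bit 2 upward,
// so POPC(x | -x) = 30 and CTZ = 32 - 30 = 2.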
6284 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6285                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6286   assert(is_integral_type(bt), "");
6287   // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6289   // xtmp = 0 - src
6290   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6291   // xtmp = xtmp | src
6292   vpor(xtmp3, xtmp3, src, vec_enc);
6293   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6294   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6295   vpsub(bt, dst, xtmp1, dst, vec_enc);
6296 }
6297 
6298 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6299   Label done;
6300   Label neg_divisor_fastpath;
6301   cmpl(divisor, 0);
6302   jccb(Assembler::less, neg_divisor_fastpath);
6303   xorl(rdx, rdx);
6304   divl(divisor);
6305   jmpb(done);
6306   bind(neg_divisor_fastpath);
6307   // Fastpath for divisor < 0:
6308   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6309   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
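  // When the divisor has its sign bit set it is at least 2^31 as an unsigned value,
  // so the unsigned quotient can only be 0 or 1; the expression above evaluates to exactly that.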
6310   movl(rdx, rax);
6311   subl(rdx, divisor);
6312   if (VM_Version::supports_bmi1()) {
6313     andnl(rax, rdx, rax);
6314   } else {
6315     notl(rdx);
6316     andl(rax, rdx);
6317   }
6318   shrl(rax, 31);
6319   bind(done);
6320 }
6321 
6322 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6323   Label done;
6324   Label neg_divisor_fastpath;
6325   cmpl(divisor, 0);
6326   jccb(Assembler::less, neg_divisor_fastpath);
6327   xorl(rdx, rdx);
6328   divl(divisor);
6329   jmpb(done);
6330   bind(neg_divisor_fastpath);
6331   // Fastpath when divisor < 0:
6332   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6333   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
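  // As in udivI, the unsigned quotient is 0 or 1 here, so the remainder is either
  // the dividend itself or dividend - divisor.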
6334   movl(rdx, rax);
6335   subl(rax, divisor);
6336   if (VM_Version::supports_bmi1()) {
6337     andnl(rax, rax, rdx);
6338   } else {
6339     notl(rax);
6340     andl(rax, rdx);
6341   }
6342   sarl(rax, 31);
6343   andl(rax, divisor);
6344   subl(rdx, rax);
6345   bind(done);
6346 }
6347 
6348 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6349   Label done;
6350   Label neg_divisor_fastpath;
6351 
6352   cmpl(divisor, 0);
6353   jccb(Assembler::less, neg_divisor_fastpath);
6354   xorl(rdx, rdx);
6355   divl(divisor);
6356   jmpb(done);
6357   bind(neg_divisor_fastpath);
6358   // Fastpath for divisor < 0:
6359   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6360   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6361   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6362   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6363   movl(rdx, rax);
6364   subl(rax, divisor);
6365   if (VM_Version::supports_bmi1()) {
6366     andnl(rax, rax, rdx);
6367   } else {
6368     notl(rax);
6369     andl(rax, rdx);
6370   }
6371   movl(tmp, rax);
6372   shrl(rax, 31); // quotient
6373   sarl(tmp, 31);
6374   andl(tmp, divisor);
6375   subl(rdx, tmp); // remainder
6376   bind(done);
6377 }
6378 
6379 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6380                                  XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
6382     // Galois field instruction based bit reversal based on following algorithm.
6383     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
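    // The bit matrix 0x8040201008040201 reverses the bit order within each byte;
    // the bswapl below then reverses the byte order, completing the 32 bit reversal.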
6384     mov64(rtmp, 0x8040201008040201L);
6385     movq(xtmp1, src);
6386     movq(xtmp2, rtmp);
6387     gf2p8affineqb(xtmp1, xtmp2, 0);
6388     movq(dst, xtmp1);
6389   } else {
6390     // Swap even and odd numbered bits.
6391     movl(rtmp, src);
6392     andl(rtmp, 0x55555555);
6393     shll(rtmp, 1);
6394     movl(dst, src);
6395     andl(dst, 0xAAAAAAAA);
6396     shrl(dst, 1);
6397     orl(dst, rtmp);
6398 
6399     // Swap LSB and MSB 2 bits of each nibble.
6400     movl(rtmp, dst);
6401     andl(rtmp, 0x33333333);
6402     shll(rtmp, 2);
6403     andl(dst, 0xCCCCCCCC);
6404     shrl(dst, 2);
6405     orl(dst, rtmp);
6406 
6407     // Swap LSB and MSB 4 bits of each byte.
6408     movl(rtmp, dst);
6409     andl(rtmp, 0x0F0F0F0F);
6410     shll(rtmp, 4);
6411     andl(dst, 0xF0F0F0F0);
6412     shrl(dst, 4);
6413     orl(dst, rtmp);
6414   }
6415   bswapl(dst);
6416 }
6417 
6418 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6419                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
6421     // Galois field instruction based bit reversal based on following algorithm.
6422     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6423     mov64(rtmp1, 0x8040201008040201L);
6424     movq(xtmp1, src);
6425     movq(xtmp2, rtmp1);
6426     gf2p8affineqb(xtmp1, xtmp2, 0);
6427     movq(dst, xtmp1);
6428   } else {
6429     // Swap even and odd numbered bits.
6430     movq(rtmp1, src);
6431     mov64(rtmp2, 0x5555555555555555L);
6432     andq(rtmp1, rtmp2);
6433     shlq(rtmp1, 1);
6434     movq(dst, src);
6435     notq(rtmp2);
6436     andq(dst, rtmp2);
6437     shrq(dst, 1);
6438     orq(dst, rtmp1);
6439 
6440     // Swap LSB and MSB 2 bits of each nibble.
6441     movq(rtmp1, dst);
6442     mov64(rtmp2, 0x3333333333333333L);
6443     andq(rtmp1, rtmp2);
6444     shlq(rtmp1, 2);
6445     notq(rtmp2);
6446     andq(dst, rtmp2);
6447     shrq(dst, 2);
6448     orq(dst, rtmp1);
6449 
6450     // Swap LSB and MSB 4 bits of each byte.
6451     movq(rtmp1, dst);
6452     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6453     andq(rtmp1, rtmp2);
6454     shlq(rtmp1, 4);
6455     notq(rtmp2);
6456     andq(dst, rtmp2);
6457     shrq(dst, 4);
6458     orq(dst, rtmp1);
6459   }
6460   bswapq(dst);
6461 }
6462 
6463 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6464   Label done;
6465   Label neg_divisor_fastpath;
6466   cmpq(divisor, 0);
6467   jccb(Assembler::less, neg_divisor_fastpath);
6468   xorl(rdx, rdx);
6469   divq(divisor);
6470   jmpb(done);
6471   bind(neg_divisor_fastpath);
6472   // Fastpath for divisor < 0:
6473   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6474   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6475   movq(rdx, rax);
6476   subq(rdx, divisor);
6477   if (VM_Version::supports_bmi1()) {
6478     andnq(rax, rdx, rax);
6479   } else {
6480     notq(rdx);
6481     andq(rax, rdx);
6482   }
6483   shrq(rax, 63);
6484   bind(done);
6485 }
6486 
6487 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6488   Label done;
6489   Label neg_divisor_fastpath;
6490   cmpq(divisor, 0);
6491   jccb(Assembler::less, neg_divisor_fastpath);
6492   xorq(rdx, rdx);
6493   divq(divisor);
6494   jmp(done);
6495   bind(neg_divisor_fastpath);
6496   // Fastpath when divisor < 0:
6497   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6498   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6499   movq(rdx, rax);
6500   subq(rax, divisor);
6501   if (VM_Version::supports_bmi1()) {
6502     andnq(rax, rax, rdx);
6503   } else {
6504     notq(rax);
6505     andq(rax, rdx);
6506   }
6507   sarq(rax, 63);
6508   andq(rax, divisor);
6509   subq(rdx, rax);
6510   bind(done);
6511 }
6512 
6513 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6514   Label done;
6515   Label neg_divisor_fastpath;
6516   cmpq(divisor, 0);
6517   jccb(Assembler::less, neg_divisor_fastpath);
6518   xorq(rdx, rdx);
6519   divq(divisor);
6520   jmp(done);
6521   bind(neg_divisor_fastpath);
6522   // Fastpath for divisor < 0:
6523   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6524   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6525   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6526   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6527   movq(rdx, rax);
6528   subq(rax, divisor);
6529   if (VM_Version::supports_bmi1()) {
6530     andnq(rax, rax, rdx);
6531   } else {
6532     notq(rax);
6533     andq(rax, rdx);
6534   }
6535   movq(tmp, rax);
6536   shrq(rax, 63); // quotient
6537   sarq(tmp, 63);
6538   andq(tmp, divisor);
6539   subq(rdx, tmp); // remainder
6540   bind(done);
6541 }
6542 
6543 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6544                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6545                                         int vlen_enc) {
6546   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using the
  // lower 4 bits of each shuffle lane, thus all shuffle indices are
  // normalized to the index range 0-15. This ensures that all multiples of 16
  // map to the same relative position within a 128 bit lane, i.e. shuffle
  // indices 16, 32 and 48 all select the first element of their respective
  // 128 bit lanes.
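  // The shuffle is processed in four passes: for pass k (k = 0 to 3) the k-th 128 bit
  // source lane is broadcast across the vector and the in-lane shuffle result is
  // merged into dst only for shuffle indices in the range [16*k, 16*k + 16).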
6553   movl(rtmp, 16);
6554   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6555 
  // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16.
  // Broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using
  // the original shuffle indices and move the shuffled lanes corresponding to the true
  // mask bits to the destination vector.
6560   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6561   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6562   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6563 
6564   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6565   // and broadcasting second 128 bit lane.
6566   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6567   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6568   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6569   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6570   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6571 
6572   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6573   // and broadcasting third 128 bit lane.
6574   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6575   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6576   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6577   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6578   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6579 
6580   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting the fourth 128 bit lane.
6582   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6583   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6584   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6585   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6586   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6587 }
6588 
6589 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6590                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6591   if (vlen_enc == AVX_128bit) {
6592     vpermilps(dst, src, shuffle, vlen_enc);
6593   } else if (bt == T_INT) {
6594     vpermd(dst, shuffle, src, vlen_enc);
6595   } else {
6596     assert(bt == T_FLOAT, "");
6597     vpermps(dst, shuffle, src, vlen_enc);
6598   }
6599 }
6600 
6601 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6602   switch(opcode) {
6603     case Op_AddHF: vaddsh(dst, src1, src2); break;
6604     case Op_SubHF: vsubsh(dst, src1, src2); break;
6605     case Op_MulHF: vmulsh(dst, src1, src2); break;
6606     case Op_DivHF: vdivsh(dst, src1, src2); break;
6607     default: assert(false, "%s", NodeClassNames[opcode]); break;
6608   }
6609 }
6610 
6611 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6612   switch(elem_bt) {
6613     case T_BYTE:
6614       if (ideal_opc == Op_SaturatingAddV) {
6615         vpaddsb(dst, src1, src2, vlen_enc);
6616       } else {
6617         assert(ideal_opc == Op_SaturatingSubV, "");
6618         vpsubsb(dst, src1, src2, vlen_enc);
6619       }
6620       break;
6621     case T_SHORT:
6622       if (ideal_opc == Op_SaturatingAddV) {
6623         vpaddsw(dst, src1, src2, vlen_enc);
6624       } else {
6625         assert(ideal_opc == Op_SaturatingSubV, "");
6626         vpsubsw(dst, src1, src2, vlen_enc);
6627       }
6628       break;
6629     default:
6630       fatal("Unsupported type %s", type2name(elem_bt));
6631       break;
6632   }
6633 }
6634 
6635 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6636   switch(elem_bt) {
6637     case T_BYTE:
6638       if (ideal_opc == Op_SaturatingAddV) {
6639         vpaddusb(dst, src1, src2, vlen_enc);
6640       } else {
6641         assert(ideal_opc == Op_SaturatingSubV, "");
6642         vpsubusb(dst, src1, src2, vlen_enc);
6643       }
6644       break;
6645     case T_SHORT:
6646       if (ideal_opc == Op_SaturatingAddV) {
6647         vpaddusw(dst, src1, src2, vlen_enc);
6648       } else {
6649         assert(ideal_opc == Op_SaturatingSubV, "");
6650         vpsubusw(dst, src1, src2, vlen_enc);
6651       }
6652       break;
6653     default:
6654       fatal("Unsupported type %s", type2name(elem_bt));
6655       break;
6656   }
6657 }
6658 
6659 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6660                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when the second input is greater than the first input.
6662   // overflow_mask = Inp1 <u Inp2
6663   evpcmpu(elem_bt, ktmp,  src2, src1, Assembler::lt, vlen_enc);
6664   // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6665   evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6666 }
6667 
6668 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6669                                                               XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6670   // Emulate unsigned comparison using signed comparison
6671   // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
6672   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6673   vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6674   vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6675 
6676   vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6677 
6678   // Res = INP1 - INP2 (non-commutative and non-associative)
6679   vpsub(elem_bt, dst, src1, src2, vlen_enc);
6680   // Res = Mask ? Zero : Res
6681   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6682   vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6683 }
6684 
6685 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6686                                                                XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned value ranges comprise only non-negative numbers, thus only upper bound saturation exists.
6688   // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6689   // Res = Signed Add INP1, INP2
6690   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6691   // T1 = SRC1 | SRC2
6692   vpor(xtmp1, src1, src2, vlen_enc);
6693   // Max_Unsigned = -1
6694   vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6695   // Unsigned compare:  Mask = Res <u T1
6696   evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6697   // res  = Mask ? Max_Unsigned : Res
6698   evpblend(elem_bt, dst, ktmp,  dst, xtmp2, true, vlen_enc);
6699 }
6700 
6701 //
// Section 2-13 of Hacker's Delight lists the following overflow detection check for the
// saturating unsigned addition operation.
//    overflow_mask = ((a & b) | ((a | b) & ~(a + b))) >>> 31 == 1
//
// We empirically determined its semantic equivalence to the following reduced expression
//    overflow_mask = (a + b) <u (a | b)
//
// and also verified it through the Alive2 solver.
// (https://alive2.llvm.org/ce/z/XDQ7dY)
6711 //
6712 
6713 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6714                                                               XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6715   // Res = Signed Add INP1, INP2
6716   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6717   // Compute T1 = INP1 | INP2
6718   vpor(xtmp3, src1, src2, vlen_enc);
  // Compute MIN_VALUE into xtmp2; xtmp1 is left holding all ones (used as Max_Unsigned below).
6720   vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6721   // Convert T1 to signed value, T1 = T1 + MIN_VALUE
6722   vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
6723   // Convert Res to signed value, Res<s> = Res + MIN_VALUE
6724   vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<s> <s T1
6726   if (elem_bt == T_INT) {
6727     vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
6728   } else {
6729     assert(elem_bt == T_LONG, "");
6730     vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
6731   }
6732   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6733 }
6734 
6735 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6736                                       int vlen_enc, bool xtmp2_hold_M1) {
6737   if (VM_Version::supports_avx512dq()) {
6738     evpmovq2m(ktmp, src, vlen_enc);
6739   } else {
6740     assert(VM_Version::supports_evex(), "");
6741     if (!xtmp2_hold_M1) {
6742       vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6743     }
6744     evpsraq(xtmp1, src, 63, vlen_enc);
6745     evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6746   }
6747 }
6748 
6749 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6750                                       int vlen_enc, bool xtmp2_hold_M1) {
6751   if (VM_Version::supports_avx512dq()) {
6752     evpmovd2m(ktmp, src, vlen_enc);
6753   } else {
6754     assert(VM_Version::supports_evex(), "");
6755     if (!xtmp2_hold_M1) {
6756       vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6757     }
6758     vpsrad(xtmp1, src, 31, vlen_enc);
6759     Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6760   }
6761 }
6762 
6763 
6764 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6765   if (elem_bt == T_LONG) {
6766     if (VM_Version::supports_evex()) {
6767       evpsraq(dst, src, 63, vlen_enc);
6768     } else {
6769       vpsrad(dst, src, 31, vlen_enc);
6770       vpshufd(dst, dst, 0xF5, vlen_enc);
6771     }
6772   } else {
6773     assert(elem_bt == T_INT, "");
6774     vpsrad(dst, src, 31, vlen_enc);
6775   }
6776 }
6777 
6778 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6779   if (compute_allones) {
6780     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6781       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6782     } else {
6783       vpcmpeqq(allones, allones, allones, vlen_enc);
6784     }
6785   }
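  // MAX_VALUE: a logical right shift of all ones by one yields 0x7FFF... in each lane.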
6786   if (elem_bt == T_LONG) {
6787     vpsrlq(dst, allones, 1, vlen_enc);
6788   } else {
6789     assert(elem_bt == T_INT, "");
6790     vpsrld(dst, allones, 1, vlen_enc);
6791   }
6792 }
6793 
6794 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6795   if (compute_allones) {
6796     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6797       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6798     } else {
6799       vpcmpeqq(allones, allones, allones, vlen_enc);
6800     }
6801   }
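  // MIN_VALUE: shifting all ones left into the sign bit yields 0x8000... in each lane.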
6802   if (elem_bt == T_LONG) {
6803     vpsllq(dst, allones, 63, vlen_enc);
6804   } else {
6805     assert(elem_bt == T_INT, "");
6806     vpslld(dst, allones, 31, vlen_enc);
6807   }
6808 }
6809 
6810 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
6811                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6812   switch(elem_bt) {
6813     case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6814     case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6815     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6816     case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6817     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6818   }
6819 }
6820 
6821 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6822   switch(elem_bt) {
6823     case  T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
6824     case  T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
6825     case  T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6826     case  T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
6827     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6828   }
6829 }
6830 
6831 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6832                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6833   if (elem_bt == T_LONG) {
6834     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6835   } else {
6836     assert(elem_bt == T_INT, "");
6837     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6838   }
6839 }
6840 
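// Scalar reference for the signed saturating add/sub lowering below; a minimal illustrative
// sketch only (assumes 32-bit lanes, this is not code emitted by the assembler):
//
//   int32_t sadd_sat(int32_t a, int32_t b) {
//     int32_t res = (int32_t)((uint32_t)a + (uint32_t)b);   // wrapping add
//     bool ovf = (((res ^ a) & (res ^ b)) >> 31) != 0;      // Hacker's Delight 2-13
//     return ovf ? (a < 0 ? INT32_MIN : INT32_MAX) : res;   // clamp toward the sign of a
//   }
//
//   int32_t ssub_sat(int32_t a, int32_t b) {
//     int32_t res = (int32_t)((uint32_t)a - (uint32_t)b);   // wrapping sub
//     bool ovf = (((a ^ b) & (res ^ a)) >> 31) != 0;        // Hacker's Delight 2-13
//     return ovf ? (a < 0 ? INT32_MIN : INT32_MAX) : res;   // clamp toward the sign of a
//   }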
6841 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6842                                                          XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6843                                                          KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6844   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection is based on Hacker's Delight, section 2-13.
6847   if (ideal_opc == Op_SaturatingAddV) {
6848     // res = src1 + src2
6849     vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have the same sign and the sign of the result differs from it.
6851     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6852     vpxor(xtmp1, dst, src1, vlen_enc);
6853     vpxor(xtmp2, dst, src2, vlen_enc);
6854     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6855   } else {
6856     assert(ideal_opc == Op_SaturatingSubV, "");
6857     // res = src1 - src2
6858     vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when the inputs have opposite signs and the sign
    // of the result differs from the sign of the first input.
    // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1;
6862     vpxor(xtmp1, src1, src2, vlen_enc);
6863     vpxor(xtmp2, dst, src1, vlen_enc);
6864     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6865   }
6866 
6867   // Compute overflow detection mask.
6868   evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 holds -1 in all its lanes after the above call.
6870 
6871   // Compute mask based on first input polarity.
6872   evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
6873 
6874   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6875   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6876 
  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in the first input polarity mask hold the MIN value.
6879   evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6880   // Blend destination lanes with saturated values using overflow detection mask.
6881   evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6882 }
6883 
6884 
6885 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6886                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6887                                                         XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
6888   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection is based on Hacker's Delight, section 2-13.
6891   if (ideal_opc == Op_SaturatingAddV) {
6892     // res = src1 + src2
6893     vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have the same sign and the sign of the result differs from it.
6895     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6896     vpxor(xtmp1, dst, src1, vlen_enc);
6897     vpxor(xtmp2, dst, src2, vlen_enc);
6898     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6899   } else {
6900     assert(ideal_opc == Op_SaturatingSubV, "");
6901     // res = src1 - src2
6902     vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when the inputs have opposite signs and the sign
    // of the result differs from the sign of the first input.
    // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1;
6906     vpxor(xtmp1, src1, src2, vlen_enc);
6907     vpxor(xtmp2, dst, src1, vlen_enc);
6908     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6909   }
6910 
6911   // Sign-extend to compute overflow detection mask.
6912   vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
6913 
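  // Materialize all ones in xtmp1; the signed MAX (all-ones >>> 1) and MIN (all-ones << width-1)
  // constants are derived from it below.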
6914   vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
6915   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
6916   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6917 
6918   // Compose saturating min/max vector using first input polarity mask.
6919   vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
6920   vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
6921 
6922   // Blend result with saturating vector using overflow detection mask.
6923   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6924 }
6925 
6926 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6927   switch(elem_bt) {
6928     case T_BYTE:
6929       if (ideal_opc == Op_SaturatingAddV) {
6930         vpaddsb(dst, src1, src2, vlen_enc);
6931       } else {
6932         assert(ideal_opc == Op_SaturatingSubV, "");
6933         vpsubsb(dst, src1, src2, vlen_enc);
6934       }
6935       break;
6936     case T_SHORT:
6937       if (ideal_opc == Op_SaturatingAddV) {
6938         vpaddsw(dst, src1, src2, vlen_enc);
6939       } else {
6940         assert(ideal_opc == Op_SaturatingSubV, "");
6941         vpsubsw(dst, src1, src2, vlen_enc);
6942       }
6943       break;
6944     default:
6945       fatal("Unsupported type %s", type2name(elem_bt));
6946       break;
6947   }
6948 }
6949 
6950 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6951   switch(elem_bt) {
6952     case T_BYTE:
6953       if (ideal_opc == Op_SaturatingAddV) {
6954         vpaddusb(dst, src1, src2, vlen_enc);
6955       } else {
6956         assert(ideal_opc == Op_SaturatingSubV, "");
6957         vpsubusb(dst, src1, src2, vlen_enc);
6958       }
6959       break;
6960     case T_SHORT:
6961       if (ideal_opc == Op_SaturatingAddV) {
6962         vpaddusw(dst, src1, src2, vlen_enc);
6963       } else {
6964         assert(ideal_opc == Op_SaturatingSubV, "");
6965         vpsubusw(dst, src1, src2, vlen_enc);
6966       }
6967       break;
6968     default:
6969       fatal("Unsupported type %s", type2name(elem_bt));
6970       break;
6971   }
6972 }
6973 
6974 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6975                                                      XMMRegister src2, int vlen_enc) {
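  // VPERMI2* treats dst as the index vector and overwrites it with elements selected from the
  // two-register table formed by src1 and src2.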
6976   switch(elem_bt) {
6977     case T_BYTE:
6978       evpermi2b(dst, src1, src2, vlen_enc);
6979       break;
6980     case T_SHORT:
6981       evpermi2w(dst, src1, src2, vlen_enc);
6982       break;
6983     case T_INT:
6984       evpermi2d(dst, src1, src2, vlen_enc);
6985       break;
6986     case T_LONG:
6987       evpermi2q(dst, src1, src2, vlen_enc);
6988       break;
6989     case T_FLOAT:
6990       evpermi2ps(dst, src1, src2, vlen_enc);
6991       break;
6992     case T_DOUBLE:
6993       evpermi2pd(dst, src1, src2, vlen_enc);
6994       break;
6995     default:
6996       fatal("Unsupported type %s", type2name(elem_bt));
6997       break;
6998   }
6999 }
7000 
7001 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
7002   if (is_unsigned) {
7003     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7004   } else {
7005     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7006   }
7007 }
7008 
7009 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
7010   if (is_unsigned) {
7011     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7012   } else {
7013     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7014   }
7015 }
7016 
7017 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
7018   switch(opcode) {
7019     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7020     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7021     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7022     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7023     default: assert(false, "%s", NodeClassNames[opcode]); break;
7024   }
7025 }
7026 
7027 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7028   switch(opcode) {
7029     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7030     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7031     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7032     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7033     default: assert(false, "%s", NodeClassNames[opcode]); break;
7034   }
7035 }
7036 
7037 void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7038                                             KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
7039   vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
7040 }
7041 
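// Reference semantics implemented below (Java-style Float16 max/min); shown only as an
// illustrative scalar sketch, not code emitted by the assembler:
//   max(a, b): NaN if either input is NaN; max(+0.0, -0.0) is +0.0; otherwise the larger value.
//   min(a, b): NaN if either input is NaN; min(+0.0, -0.0) is -0.0; otherwise the smaller value.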
7042 void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7043                                             KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
7044   if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
7045     // Move sign bits of src2 to mask register.
7046     evpmovw2m(ktmp, src2, vlen_enc);
7047     // xtmp1 = src2 < 0 ? src2 : src1
7048     evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? src1 : src2
7050     evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // The idea behind the above swapping is to make the second source operand a non-negative value.
    // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned.
7055     // dst = max(xtmp1, xtmp2)
7056     evmaxph(dst, xtmp1, xtmp2, vlen_enc);
7057     // isNaN = is_unordered_quiet(xtmp1)
7058     evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // The final result is the same as the first source if it is a NaN value;
    // if the second operand holds a NaN value then, as per the above semantics,
    // the result is the same as the second operand.
7062     Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7063   } else {
7064     assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
7065     // Move sign bits of src1 to mask register.
7066     evpmovw2m(ktmp, src1, vlen_enc);
7067     // xtmp1 = src1 < 0 ? src2 : src1
7068     evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7069     // xtmp2 = src1 < 0 ? src1 : src2
7070     evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // The idea behind the above swapping is to make the second source operand a negative value.
7072     // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
7073     // the second source operand is returned.
7074     // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
7075     // or a valid floating-point value, is written to the result.
7076     // dst = min(xtmp1, xtmp2)
7077     evminph(dst, xtmp1, xtmp2, vlen_enc);
7078     // isNaN = is_unordered_quiet(xtmp1)
7079     evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // The final result is the same as the first source if it is a NaN value;
    // if the second operand holds a NaN value then, as per the above semantics,
    // the result is the same as the second operand.
7083     Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7084   }
7085 }