1 /*
   2  * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "gc/shared/barrierSet.hpp"
  28 #include "gc/shared/barrierSetAssembler.hpp"
  29 #include "oops/methodData.hpp"
  30 #include "opto/c2_MacroAssembler.hpp"
  31 #include "opto/intrinsicnode.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/opcodes.hpp"
  34 #include "opto/subnode.hpp"
  35 #include "runtime/globals.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 #include "utilities/checkedCast.hpp"
  39 #include "utilities/globalDefinitions.hpp"
  40 #include "utilities/powerOfTwo.hpp"
  41 #include "utilities/sizes.hpp"
  42 
  43 #ifdef PRODUCT
  44 #define BLOCK_COMMENT(str) /* nothing */
  45 #define STOP(error) stop(error)
  46 #else
  47 #define BLOCK_COMMENT(str) block_comment(str)
  48 #define STOP(error) block_comment(error); stop(error)
  49 #endif
  50 
  51 // C2 compiled method's prolog code.
  52 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  53   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  54 
  55   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  56   // Remove word for return addr
  57   framesize -= wordSize;
  58   stack_bang_size -= wordSize;
  59 
  60   // Calls to C2R adapters often do not accept exceptional returns.
  61   // We require their callers to bang for them.  But be careful, because
  62   // some VM calls (such as call site linkage) can use several kilobytes of
  63   // stack; the stack safety zone should account for that.
  64   // See bugs 4446381, 4468289, 4497237.
  65   if (stack_bang_size > 0) {
  66     generate_stack_overflow_check(stack_bang_size);
  67 
  68     // We always push rbp so that, on return to the interpreter, rbp will be
  69     // restored correctly and we can correct the stack.
  70     push(rbp);
  71     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  72     if (PreserveFramePointer) {
  73       mov(rbp, rsp);
  74     }
  75     // Remove word for ebp
  76     framesize -= wordSize;
  77 
  78     // Create frame
  79     if (framesize) {
  80       subptr(rsp, framesize);
  81     }
  82   } else {
  83     subptr(rsp, framesize);
  84 
  85     // Save RBP register now.
  86     framesize -= wordSize;
  87     movptr(Address(rsp, framesize), rbp);
  88     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  89     if (PreserveFramePointer) {
  90       movptr(rbp, rsp);
  91       if (framesize > 0) {
  92         addptr(rbp, framesize);
  93       }
  94     }
  95   }
  96 
  97   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
  98     framesize -= wordSize;
  99     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 100   }
 101 
 102 #ifdef ASSERT
 103   if (VerifyStackAtCalls) {
 104     Label L;
 105     push(rax);
 106     mov(rax, rsp);
 107     andptr(rax, StackAlignmentInBytes-1);
 108     cmpptr(rax, StackAlignmentInBytes-wordSize);
 109     pop(rax);
 110     jcc(Assembler::equal, L);
 111     STOP("Stack is not properly aligned!");
 112     bind(L);
 113   }
 114 #endif
 115 
 116   if (!is_stub) {
 117     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 118     // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 119     Label dummy_slow_path;
 120     Label dummy_continuation;
 121     Label* slow_path = &dummy_slow_path;
 122     Label* continuation = &dummy_continuation;
 123     if (!Compile::current()->output()->in_scratch_emit_size()) {
 124       // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 125       C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 126       Compile::current()->output()->add_stub(stub);
 127       slow_path = &stub->entry();
 128       continuation = &stub->continuation();
 129     }
 130     bs->nmethod_entry_barrier(this, slow_path, continuation);
 131   }
 132 }
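
     // Illustrative frame-layout sketch (not emitted code) for the stack-bang path above,
     // with PreserveFramePointer and VerifyStackAtCalls enabled; the stack grows downwards:
     //
     //   [ return address ]   <- pushed by the caller's call instruction
     //   [ saved rbp      ]   <- rbp points here when PreserveFramePointer is on
     //   [ 0xbadb100d     ]   <- VerifyStackAtCalls cookie
     //   [ spills/locals  ]
     //   [ ...            ]   <- rsp after subptr(rsp, framesize)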
 133 
 134 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 135   switch (vlen_in_bytes) {
 136     case  4: // fall-through
 137     case  8: // fall-through
 138     case 16: return Assembler::AVX_128bit;
 139     case 32: return Assembler::AVX_256bit;
 140     case 64: return Assembler::AVX_512bit;
 141 
 142     default: {
 143       ShouldNotReachHere();
 144       return Assembler::AVX_NoVec;
 145     }
 146   }
 147 }
 148 
 149 // fast_lock and fast_unlock used by C2
 150 
 151 // Because the transitions from emitted code to the runtime
 152 // monitorenter/exit helper stubs are so slow it's critical that
 153 // we inline both the stack-locking fast path and the inflated fast path.
 154 //
 155 // See also: cmpFastLock and cmpFastUnlock.
 156 //
 157 // What follows is a specialized inline transliteration of the code
 158 // in enter() and exit(). If we're concerned about I$ bloat, another
 159 // option would be to emit TrySlowEnter and TrySlowExit methods
 160 // at startup-time.  These methods would accept arguments as
 161 // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 162 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 163 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 164 // In practice, however, the # of lock sites is bounded and is usually small.
 165 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
 166 // if the processor uses simple bimodal branch predictors keyed by EIP,
 167 // since the helper routines would be called from multiple synchronization
 168 // sites.
 169 //
 170 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 171 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 172 // to those specialized methods.  That'd give us a mostly platform-independent
 173 // implementation that the JITs could optimize and inline at their pleasure.
 174 // Done correctly, the only time we'd need to cross to native code would be
 175 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 176 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 177 // (b) issue explicit barriers or fence operations.
 178 //
 179 // TODO:
 180 //
 181 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 182 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 183 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 184 //    the lock operators would typically be faster than reifying Self.
 185 //
 186 // *  Ideally I'd define the primitives as:
 187 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 188 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 189 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 190 //    Instead, we're stuck with the rather awkward and brittle register assignments below.
 191 //    Furthermore, the register assignments are overconstrained, possibly resulting in
 192 //    sub-optimal code near the synchronization site.
 193 //
 194 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 195 //    Alternately, use a better sp-proximity test.
 196 //
 197 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 198 //    Either one is sufficient to uniquely identify a thread.
 199 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 200 //
 201 // *  Intrinsify notify() and notifyAll() for the common cases where the
 202 //    object is locked by the calling thread but the waitlist is empty,
 203 //    avoiding the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
 204 //
 205 // *  Use jccb and jmpb instead of jcc and jmp to improve code density.
 206 //    But beware of excessive branch density on AMD Opterons.
 207 //
 208 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 209 //    or failure of the fast path.  If the fast path fails then we pass
 210 //    control to the slow path, typically in C.  In fast_lock and
 211 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 212 //    will emit a conditional branch immediately after the node.
 213 //    So we have branches to branches and lots of ICC.ZF games.
 214 //    Instead, it might be better to have C2 pass a "FailureLabel"
 215 //    into fast_lock and fast_unlock.  In the case of success, control
 216 //    will drop through the node.  ICC.ZF is undefined at exit.
 217 //    In the case of failure, the node will branch directly to the
 218 //    FailureLabel.
 219 
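     // Illustrative sketch (not part of the emitted code below) of how C2 consumes the
     // ICC.ZF protocol at a cmpFastLock/cmpFastUnlock site:
     //
     //   fast_lock(obj, box, ...)    // sets ZF = 1 on success, ZF = 0 on failure
     //   jne   slow_enter            // ZF == 0 -> call the runtime monitorenter helper
     //   ...                         // critical section
     //   fast_unlock(obj, box, ...)  // same ZF protocol
     //   jne   slow_exit             // ZF == 0 -> call the runtime monitorexit helper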
 220 
 221 // obj: object to lock
 222 // box: on-stack box address (displaced header location) - KILLED
 223 // rax: tmp -- KILLED
 224 // scr: tmp -- KILLED
 225 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 226                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 227                                  Metadata* method_data) {
 228   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 229   // Ensure the register assignments are disjoint
 230   assert(tmpReg == rax, "");
 231   assert(cx1Reg == noreg, "");
 232   assert(cx2Reg == noreg, "");
 233   assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 234 
 235   // Possible cases that we'll encounter in fast_lock
 236   // ------------------------------------------------
 237   // * Inflated
 238   //    -- unlocked
 239   //    -- Locked
 240   //       = by self
 241   //       = by other
 242   // * neutral
 243   // * stack-locked
 244   //    -- by self
 245   //       = sp-proximity test hits
 246   //       = sp-proximity test generates false-negative
 247   //    -- by other
 248   //
 249 
 250   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 251 
 252   if (DiagnoseSyncOnValueBasedClasses != 0) {
 253     load_klass(tmpReg, objReg, scrReg);
 254     testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 255     jcc(Assembler::notZero, DONE_LABEL);
 256   }
 257 
 258   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 259   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 260   jcc(Assembler::notZero, IsInflated);
 261 
 262   if (LockingMode == LM_MONITOR) {
 263     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 264     testptr(objReg, objReg);
 265   } else {
 266     assert(LockingMode == LM_LEGACY, "must be");
 267     // Attempt stack-locking ...
 268     orptr (tmpReg, markWord::unlocked_value);
 269     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 270     lock();
 271     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 272     jcc(Assembler::equal, COUNT);           // Success
 273 
 274     // Recursive locking.
 275     // The object is stack-locked: markword contains stack pointer to BasicLock.
 276     // Locked by current thread if difference with current SP is less than one page.
 277     subptr(tmpReg, rsp);
 278     // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
 279     andptr(tmpReg, (int32_t) (7 - (int)os::vm_page_size()) );
 280     movptr(Address(boxReg, 0), tmpReg);
 281   }
 282   jmp(DONE_LABEL);
 283 
 284   bind(IsInflated);
 285   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 286 
 287   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 288   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 289   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 290 
 291   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 292   movptr(boxReg, Address(r15_thread, JavaThread::monitor_owner_id_offset()));
 293   movq(scrReg, tmpReg);
 294   xorq(tmpReg, tmpReg);
 295   lock();
 296   cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 297 
 298   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 299   jccb(Assembler::equal, COUNT);    // CAS above succeeded; propagate ZF = 1 (success)
 300 
 301   cmpptr(boxReg, rax);                // Check if we are already the owner (recursive lock)
 302   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 303   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 304   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 305   bind(DONE_LABEL);
 306 
 307   // ZFlag == 1 count in fast path
 308   // ZFlag == 0 count in slow path
 309   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 310 
 311   bind(COUNT);
 312   if (LockingMode == LM_LEGACY) {
 313     // Count monitors in fast path
 314     increment(Address(thread, JavaThread::held_monitor_count_offset()));
 315   }
 316   xorl(tmpReg, tmpReg); // Set ZF == 1
 317 
 318   bind(NO_COUNT);
 319 
 320   // At NO_COUNT the icc ZFlag is set as follows ...
 321   // fast_unlock uses the same protocol.
 322   // ZFlag == 1 -> Success
 323   // ZFlag == 0 -> Failure - force control through the slow path
 324 }
 325 
 326 // obj: object to unlock
 327 // box: box address (displaced header location), killed.  Must be EAX.
 328 // tmp: killed, cannot be obj nor box.
 329 //
 330 // Some commentary on balanced locking:
 331 //
 332 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 333 // Methods that don't have provably balanced locking are forced to run in the
 334 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 335 // The interpreter provides two properties:
 336 // I1:  At return-time the interpreter automatically and quietly unlocks any
 337 //      objects acquired by the current activation (frame).  Recall that the
 338 //      interpreter maintains an on-stack list of locks currently held by
 339 //      a frame.
 340 // I2:  If a method attempts to unlock an object that is not held by
 341 //      the frame, the interpreter throws IMSX.
 342 //
 343 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
 344 // B() doesn't have provably balanced locking so it runs in the interpreter.
 345 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 346 // is still locked by A().
 347 //
 348 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 349 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 350 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 351 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
 352 // Arguably, given that the spec legislates the JNI case as undefined, our implementation
 353 // could reasonably *avoid* checking the owner in fast_unlock().
 354 // In the interest of performance we elide the m->Owner == Self check in unlock.
 355 // A perfectly viable alternative is to elide the owner check except when
 356 // Xcheck:jni is enabled.
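     //
     // Illustrative example of the I1/I2 reasoning above (hypothetical methods):
     //   synchronized void A() { B(); }   // provably balanced -> compiled, uses fast_lock/fast_unlock
     //   void B()              { /* unbalanced locking -> runs in the interpreter */ }
     // By I1, any monitors B() acquires are unlocked when B()'s frame returns, and by I2 an
     // attempt by B() to unlock an object it does not hold throws IMSX, so when control
     // returns to A() the object locked by A() is still locked by A().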
 357 
 358 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
 359   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 360   assert(boxReg == rax, "");
 361   assert_different_registers(objReg, boxReg, tmpReg);
 362 
 363   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 364 
 365   if (LockingMode == LM_LEGACY) {
 366     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 367     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 368   }
 369   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 370   if (LockingMode != LM_MONITOR) {
 371     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 372     jcc(Assembler::zero, Stacked);
 373   }
 374 
 375   // It's inflated.
 376 
 377   // Despite our balanced locking property we still check that m->_owner == Self
 378   // as java routines or native JNI code called by this thread might
 379   // have released the lock.
 380   //
 381   // If there's no contention try a 1-0 exit.  That is, exit without
 382   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 383   // we detect and recover from the race that the 1-0 exit admits.
 384   //
 385   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 386   // before it STs null into _owner, releasing the lock.  Updates
 387   // to data protected by the critical section must be visible before
 388   // we drop the lock (and thus before any other thread could acquire
 389   // the lock and observe the fields protected by the lock).
 390   // IA32's memory-model is SPO, so STs are ordered with respect to
 391   // each other and there's no need for an explicit barrier (fence).
 392   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 393   Label LSuccess, LNotRecursive;
 394 
 395   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 396   jccb(Assembler::equal, LNotRecursive);
 397 
 398   // Recursive inflated unlock
 399   decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 400   jmpb(LSuccess);
 401 
 402   bind(LNotRecursive);
 403 
 404   // Set owner to null.
 405   // Release to satisfy the JMM
 406   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 407   // We need a full fence after clearing owner to avoid stranding.
 408   // StoreLoad achieves this.
 409   membar(StoreLoad);
 410 
 411   // Check if the entry_list is empty.
 412   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(entry_list)), NULL_WORD);
 413   jccb(Assembler::zero, LSuccess);    // If so we are done.
 414 
 415   // Check if there is a successor.
 416   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 417   jccb(Assembler::notZero, LSuccess); // If so we are done.
 418 
 419   // Save the monitor pointer in the current thread, so we can try to
 420   // reacquire the lock in SharedRuntime::monitor_exit_helper().
 421   andptr(tmpReg, ~(int32_t)markWord::monitor_value);
 422   movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);
 423 
 424   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 425   jmpb  (DONE_LABEL);
 426 
 427   bind  (LSuccess);
 428   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 429   jmpb  (DONE_LABEL);
 430 
 431   if (LockingMode == LM_LEGACY) {
 432     bind  (Stacked);
 433     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 434     lock();
 435     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 436     // Intentional fall-thru into DONE_LABEL
 437   }
 438 
 439   bind(DONE_LABEL);
 440 
 441   // ZFlag == 1 count in fast path
 442   // ZFlag == 0 count in slow path
 443   jccb(Assembler::notZero, NO_COUNT);
 444 
 445   bind(COUNT);
 446 
 447   if (LockingMode == LM_LEGACY) {
 448     // Count monitors in fast path
 449     decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 450   }
 451 
 452   xorl(tmpReg, tmpReg); // Set ZF == 1
 453 
 454   bind(NO_COUNT);
 455 }
 456 
 457 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 458                                               Register t, Register thread) {
 459   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 460   assert(rax_reg == rax, "Used for CAS");
 461   assert_different_registers(obj, box, rax_reg, t, thread);
 462 
 463   // Handle inflated monitor.
 464   Label inflated;
 465   // Finish fast lock successfully. ZF value is irrelevant.
 466   Label locked;
 467   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 468   Label slow_path;
 469 
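       // Mark word lock-bit encodings relied on below (summary; see markWord.hpp):
       //   0b01 - unlocked                                  (markWord::unlocked_value)
       //   0b00 - fast-locked, entry on some thread's lock-stack
       //   0b10 - inflated, mark holds the ObjectMonitor*   (markWord::monitor_value)
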
 470   if (UseObjectMonitorTable) {
 471     // Clear cache in case fast locking succeeds or we need to take the slow-path.
 472     movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
 473   }
 474 
 475   if (DiagnoseSyncOnValueBasedClasses != 0) {
 476     load_klass(rax_reg, obj, t);
 477     testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 478     jcc(Assembler::notZero, slow_path);
 479   }
 480 
 481   const Register mark = t;
 482 
 483   { // Lightweight Lock
 484 
 485     Label push;
 486 
 487     const Register top = UseObjectMonitorTable ? rax_reg : box;
 488 
 489     // Load the mark.
 490     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 491 
 492     // Prefetch top.
 493     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 494 
 495     // Check for monitor (0b10).
 496     testptr(mark, markWord::monitor_value);
 497     jcc(Assembler::notZero, inflated);
 498 
 499     // Check if lock-stack is full.
 500     cmpl(top, LockStack::end_offset() - 1);
 501     jcc(Assembler::greater, slow_path);
 502 
 503     // Check if recursive.
 504     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 505     jccb(Assembler::equal, push);
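         // (A recursive lightweight lock is recorded by pushing obj again below, so
         //  consecutive identical lock-stack entries represent recursive acquisitions.)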
 506 
 507     // Try to lock. Transition lock bits 0b01 => 0b00
 508     movptr(rax_reg, mark);
 509     orptr(rax_reg, markWord::unlocked_value);
 510     andptr(mark, ~(int32_t)markWord::unlocked_value);
 511     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 512     jcc(Assembler::notEqual, slow_path);
 513 
 514     if (UseObjectMonitorTable) {
 515       // Need to reload top, clobbered by CAS.
 516       movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 517     }
 518     bind(push);
 519     // After successful lock, push object on lock-stack.
 520     movptr(Address(thread, top), obj);
 521     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 522     jmpb(locked);
 523   }
 524 
 525   { // Handle inflated monitor.
 526     bind(inflated);
 527 
 528     const Register monitor = t;
 529 
 530     if (!UseObjectMonitorTable) {
 531       assert(mark == monitor, "should be the same here");
 532     } else {
 533       // Uses ObjectMonitorTable.  Look for the monitor in the om_cache.
 534       // Fetch ObjectMonitor* from the cache or take the slow-path.
 535       Label monitor_found;
 536 
 537       // Load cache address
 538       lea(t, Address(thread, JavaThread::om_cache_oops_offset()));
 539 
 540       const int num_unrolled = 2;
 541       for (int i = 0; i < num_unrolled; i++) {
 542         cmpptr(obj, Address(t));
 543         jccb(Assembler::equal, monitor_found);
 544         increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 545       }
 546 
 547       Label loop;
 548 
 549       // Search for obj in cache.
 550       bind(loop);
 551 
 552       // Check for match.
 553       cmpptr(obj, Address(t));
 554       jccb(Assembler::equal, monitor_found);
 555 
 556       // Search until null encountered, guaranteed _null_sentinel at end.
 557       cmpptr(Address(t), 1);
 558       jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
 559       increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 560       jmpb(loop);
 561 
 562       // Cache hit.
 563       bind(monitor_found);
 564       movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
 565     }
 566     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 567     const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 568     const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);
 569 
 570     Label monitor_locked;
 571     // Lock the monitor.
 572 
 573     if (UseObjectMonitorTable) {
 574       // Cache the monitor for unlock before trashing box. On failure to acquire
 575       // the lock, the slow path will reset the entry accordingly (see CacheSetter).
 576       movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
 577     }
 578 
 579     // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 580     xorptr(rax_reg, rax_reg);
 581     movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
 582     lock(); cmpxchgptr(box, owner_address);
 583     jccb(Assembler::equal, monitor_locked);
 584 
 585     // Check if recursive.
 586     cmpptr(box, rax_reg);
 587     jccb(Assembler::notEqual, slow_path);
 588 
 589     // Recursive.
 590     increment(recursions_address);
 591 
 592     bind(monitor_locked);
 593   }
 594 
 595   bind(locked);
 596   // Set ZF = 1
 597   xorl(rax_reg, rax_reg);
 598 
 599 #ifdef ASSERT
 600   // Check that locked label is reached with ZF set.
 601   Label zf_correct;
 602   Label zf_bad_zero;
 603   jcc(Assembler::zero, zf_correct);
 604   jmp(zf_bad_zero);
 605 #endif
 606 
 607   bind(slow_path);
 608 #ifdef ASSERT
 609   // Check that slow_path label is reached with ZF not set.
 610   jcc(Assembler::notZero, zf_correct);
 611   stop("Fast Lock ZF != 0");
 612   bind(zf_bad_zero);
 613   stop("Fast Lock ZF != 1");
 614   bind(zf_correct);
 615 #endif
 616   // C2 uses the value of ZF to determine the continuation.
 617 }
 618 
 619 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
 620   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 621   assert(reg_rax == rax, "Used for CAS");
 622   assert_different_registers(obj, reg_rax, t);
 623 
 624   // Handle inflated monitor.
 625   Label inflated, inflated_check_lock_stack;
 626   // Finish fast unlock successfully.  MUST jump with ZF == 1
 627   Label unlocked, slow_path;
 628 
 629   const Register mark = t;
 630   const Register monitor = t;
 631   const Register top = UseObjectMonitorTable ? t : reg_rax;
 632   const Register box = reg_rax;
 633 
 634   Label dummy;
 635   C2FastUnlockLightweightStub* stub = nullptr;
 636 
 637   if (!Compile::current()->output()->in_scratch_emit_size()) {
 638     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
 639     Compile::current()->output()->add_stub(stub);
 640   }
 641 
 642   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
 643 
 644   { // Lightweight Unlock
 645 
 646     // Load top.
 647     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 648 
 649     if (!UseObjectMonitorTable) {
 650       // Prefetch mark.
 651       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 652     }
 653 
 654     // Check if obj is top of lock-stack.
 655     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 656     // Top of lock stack was not obj. Must be monitor.
 657     jcc(Assembler::notEqual, inflated_check_lock_stack);
 658 
 659     // Pop lock-stack.
 660     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
 661     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 662 
 663     // Check if recursive.
 664     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
 665     jcc(Assembler::equal, unlocked);
 666 
 667     // We elide the monitor check, let the CAS fail instead.
 668 
 669     if (UseObjectMonitorTable) {
 670       // Load mark.
 671       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 672     }
 673 
 674     // Try to unlock. Transition lock bits 0b00 => 0b01
 675     movptr(reg_rax, mark);
 676     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
 677     orptr(mark, markWord::unlocked_value);
 678     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 679     jcc(Assembler::notEqual, push_and_slow_path);
 680     jmp(unlocked);
 681   }
 682 
 683 
 684   { // Handle inflated monitor.
 685     bind(inflated_check_lock_stack);
 686 #ifdef ASSERT
 687     Label check_done;
 688     subl(top, oopSize);
 689     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
 690     jcc(Assembler::below, check_done);
 691     cmpptr(obj, Address(thread, top));
 692     jccb(Assembler::notEqual, inflated_check_lock_stack);
 693     stop("Fast Unlock lock on stack");
 694     bind(check_done);
 695     if (UseObjectMonitorTable) {
 696       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 697     }
 698     testptr(mark, markWord::monitor_value);
 699     jccb(Assembler::notZero, inflated);
 700     stop("Fast Unlock not monitor");
 701 #endif
 702 
 703     bind(inflated);
 704 
 705     if (!UseObjectMonitorTable) {
 706       assert(mark == monitor, "should be the same here");
 707     } else {
 708       // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
 709       movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 710       // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
 711       cmpptr(monitor, alignof(ObjectMonitor*));
 712       jcc(Assembler::below, slow_path);
 713     }
 714     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 715     const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
 716     const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
 717     const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
 718     const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
 719 
 720     Label recursive;
 721 
 722     // Check if recursive.
 723     cmpptr(recursions_address, 0);
 724     jccb(Assembler::notZero, recursive);
 725 
 726     // Set owner to null.
 727     // Release to satisfy the JMM
 728     movptr(owner_address, NULL_WORD);
 729     // We need a full fence after clearing owner to avoid stranding.
 730     // StoreLoad achieves this.
 731     membar(StoreLoad);
 732 
 733     // Check if the entry_list is empty.
 734     cmpptr(entry_list_address, NULL_WORD);
 735     jccb(Assembler::zero, unlocked);    // If so we are done.
 736 
 737     // Check if there is a successor.
 738     cmpptr(succ_address, NULL_WORD);
 739     jccb(Assembler::notZero, unlocked); // If so we are done.
 740 
 741     // Save the monitor pointer in the current thread, so we can try to
 742     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 743     if (!UseObjectMonitorTable) {
 744       andptr(monitor, ~(int32_t)markWord::monitor_value);
 745     }
 746     movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);
 747 
 748     orl(t, 1); // Fast Unlock ZF = 0
 749     jmpb(slow_path);
 750 
 751     // Recursive unlock.
 752     bind(recursive);
 753     decrement(recursions_address);
 754   }
 755 
 756   bind(unlocked);
 757   xorl(t, t); // Fast Unlock ZF = 1
 758 
 759 #ifdef ASSERT
 760   // Check that unlocked label is reached with ZF set.
 761   Label zf_correct;
 762   Label zf_bad_zero;
 763   jcc(Assembler::zero, zf_correct);
 764   jmp(zf_bad_zero);
 765 #endif
 766 
 767   bind(slow_path);
 768   if (stub != nullptr) {
 769     bind(stub->slow_path_continuation());
 770   }
 771 #ifdef ASSERT
 772   // Check that stub->continuation() label is reached with ZF not set.
 773   jcc(Assembler::notZero, zf_correct);
 774   stop("Fast Unlock ZF != 0");
 775   bind(zf_bad_zero);
 776   stop("Fast Unlock ZF != 1");
 777   bind(zf_correct);
 778 #endif
 779   // C2 uses the value of ZF to determine the continuation.
 780 }
 781 
 782 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
 783   fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
 784 }
 785 
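     // Note: rsp + framesize - 2*wordSize is the address of the saved-rbp slot (just below the
     // return address), i.e. the value rbp holds when PreserveFramePointer is enabled; the
     // ASSERT block in reconstruct_frame_pointer below relies on exactly that equality.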
 786 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
 787   const int framesize = Compile::current()->output()->frame_size_in_bytes();
 788   masm->movptr(dst, rsp);
 789   if (framesize > 2 * wordSize) {
 790     masm->addptr(dst, framesize - 2 * wordSize);
 791   }
 792 }
 793 
 794 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
 795   if (PreserveFramePointer) {
 796     // frame pointer is valid
 797 #ifdef ASSERT
 798     // Verify frame pointer value in rbp.
 799     reconstruct_frame_pointer_helper(this, rtmp);
 800     Label L_success;
 801     cmpq(rbp, rtmp);
 802     jccb(Assembler::equal, L_success);
 803     STOP("frame pointer mismatch");
 804     bind(L_success);
 805 #endif // ASSERT
 806   } else {
 807     reconstruct_frame_pointer_helper(this, rbp);
 808   }
 809 }
 810 
 811 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
 812   jint lo = t->_lo;
 813   jint hi = t->_hi;
 814   assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
 815   if (t == TypeInt::INT) {
 816     return;
 817   }
 818 
 819   BLOCK_COMMENT("CastII {");
 820   Label fail;
 821   Label succeed;
 822   if (hi == max_jint) {
 823     cmpl(val, lo);
 824     jccb(Assembler::greaterEqual, succeed);
 825   } else {
 826     if (lo != min_jint) {
 827       cmpl(val, lo);
 828       jccb(Assembler::less, fail);
 829     }
 830     cmpl(val, hi);
 831     jccb(Assembler::lessEqual, succeed);
 832   }
 833 
 834   bind(fail);
 835   movl(c_rarg0, idx);
 836   movl(c_rarg1, val);
 837   movl(c_rarg2, lo);
 838   movl(c_rarg3, hi);
 839   reconstruct_frame_pointer(rscratch1);
 840   call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
 841   hlt();
 842   bind(succeed);
 843   BLOCK_COMMENT("} // CastII");
 844 }
 845 
 846 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
 847   fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
 848 }
 849 
 850 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
 851   jlong lo = t->_lo;
 852   jlong hi = t->_hi;
 853   assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
 854   if (t == TypeLong::LONG) {
 855     return;
 856   }
 857 
 858   BLOCK_COMMENT("CastLL {");
 859   Label fail;
 860   Label succeed;
 861 
 862   auto cmp_val = [&](jlong bound) {
 863     if (is_simm32(bound)) {
 864       cmpq(val, checked_cast<int>(bound));
 865     } else {
 866       mov64(tmp, bound);
 867       cmpq(val, tmp);
 868     }
 869   };
 870 
 871   if (hi == max_jlong) {
 872     cmp_val(lo);
 873     jccb(Assembler::greaterEqual, succeed);
 874   } else {
 875     if (lo != min_jlong) {
 876       cmp_val(lo);
 877       jccb(Assembler::less, fail);
 878     }
 879     cmp_val(hi);
 880     jccb(Assembler::lessEqual, succeed);
 881   }
 882 
 883   bind(fail);
 884   movl(c_rarg0, idx);
 885   movq(c_rarg1, val);
 886   mov64(c_rarg2, lo);
 887   mov64(c_rarg3, hi);
 888   reconstruct_frame_pointer(rscratch1);
 889   call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
 890   hlt();
 891   bind(succeed);
 892   BLOCK_COMMENT("} // CastLL");
 893 }
 894 
 895 //-------------------------------------------------------------------------------------------
 896 // Generic instructions support for use in .ad files C2 code generation
 897 
 898 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 899   if (dst != src) {
 900     movdqu(dst, src);
 901   }
 902   if (opcode == Op_AbsVD) {
 903     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 904   } else {
 905     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 906     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 907   }
 908 }
 909 
 910 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 911   if (opcode == Op_AbsVD) {
 912     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 913   } else {
 914     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 915     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 916   }
 917 }
 918 
 919 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 920   if (dst != src) {
 921     movdqu(dst, src);
 922   }
 923   if (opcode == Op_AbsVF) {
 924     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 925   } else {
 926     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 927     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 928   }
 929 }
 930 
 931 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 932   if (opcode == Op_AbsVF) {
 933     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 934   } else {
 935     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 936     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 937   }
 938 }
 939 
 940 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 941   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 942   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 943 
 944   if (opcode == Op_MinV) {
 945     if (elem_bt == T_BYTE) {
 946       pminsb(dst, src);
 947     } else if (elem_bt == T_SHORT) {
 948       pminsw(dst, src);
 949     } else if (elem_bt == T_INT) {
 950       pminsd(dst, src);
 951     } else {
 952       assert(elem_bt == T_LONG, "required");
 953       assert(tmp == xmm0, "required");
 954       assert_different_registers(dst, src, tmp);
 955       movdqu(xmm0, dst);
 956       pcmpgtq(xmm0, src);
 957       blendvpd(dst, src);  // xmm0 as mask
 958     }
 959   } else { // opcode == Op_MaxV
 960     if (elem_bt == T_BYTE) {
 961       pmaxsb(dst, src);
 962     } else if (elem_bt == T_SHORT) {
 963       pmaxsw(dst, src);
 964     } else if (elem_bt == T_INT) {
 965       pmaxsd(dst, src);
 966     } else {
 967       assert(elem_bt == T_LONG, "required");
 968       assert(tmp == xmm0, "required");
 969       assert_different_registers(dst, src, tmp);
 970       movdqu(xmm0, src);
 971       pcmpgtq(xmm0, dst);
 972       blendvpd(dst, src);  // xmm0 as mask
 973     }
 974   }
 975 }
 976 
 977 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 978                                   XMMRegister src1, Address src2, int vlen_enc) {
 979   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 980   if (opcode == Op_UMinV) {
 981     switch(elem_bt) {
 982       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 983       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 984       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 985       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 986       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 987     }
 988   } else {
 989     assert(opcode == Op_UMaxV, "required");
 990     switch(elem_bt) {
 991       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 992       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 993       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 994       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 995       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 996     }
 997   }
 998 }
 999 
1000 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
1001   // For optimality, leverage a full vector width of 512 bits
1002   // for operations over smaller vector sizes on AVX512 targets.
1003   if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
1004     if (opcode == Op_UMaxV) {
1005       evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
1006     } else {
1007       assert(opcode == Op_UMinV, "required");
1008       evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
1009     }
1010   } else {
1011     // T1 = -1
1012     vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
1013     // T1 = -1 << 63
1014     vpsllq(xtmp1, xtmp1, 63, vlen_enc);
1015     // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
1016     vpaddq(xtmp2, xtmp1, src2, vlen_enc);
1017     // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
1018     vpaddq(xtmp1, xtmp1, src1, vlen_enc);
1019     // Mask = T2 > T1
1020     vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
1021     if (opcode == Op_UMaxV) {
1022       // Res = Mask ? Src2 : Src1
1023       vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
1024     } else {
1025       // Res = Mask ? Src1 : Src2
1026       vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
1027     }
1028   }
1029 }
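
     // Illustrative scalar equivalent of the bias trick used above (hypothetical helper, not
     // part of this file): adding 2^63 maps unsigned order onto signed order, so a signed
     // compare of the biased values is an unsigned compare of the originals.
     //
     //   bool unsigned_gt(uint64_t a, uint64_t b) {
     //     const uint64_t bias = UINT64_C(1) << 63;           // the "-1 << 63" constant above
     //     return (int64_t)(a + bias) > (int64_t)(b + bias);  // signed compare after biasing
     //   }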
1030 
1031 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
1032                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
1033   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
1034   if (opcode == Op_UMinV) {
1035     switch(elem_bt) {
1036       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
1037       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
1038       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
1039       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
1040       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
1041     }
1042   } else {
1043     assert(opcode == Op_UMaxV, "required");
1044     switch(elem_bt) {
1045       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
1046       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
1047       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
1048       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
1049       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
1050     }
1051   }
1052 }
1053 
1054 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1055                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1056                                  int vlen_enc) {
1057   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1058 
1059   if (opcode == Op_MinV) {
1060     if (elem_bt == T_BYTE) {
1061       vpminsb(dst, src1, src2, vlen_enc);
1062     } else if (elem_bt == T_SHORT) {
1063       vpminsw(dst, src1, src2, vlen_enc);
1064     } else if (elem_bt == T_INT) {
1065       vpminsd(dst, src1, src2, vlen_enc);
1066     } else {
1067       assert(elem_bt == T_LONG, "required");
1068       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1069         vpminsq(dst, src1, src2, vlen_enc);
1070       } else {
1071         assert_different_registers(dst, src1, src2);
1072         vpcmpgtq(dst, src1, src2, vlen_enc);
1073         vblendvpd(dst, src1, src2, dst, vlen_enc);
1074       }
1075     }
1076   } else { // opcode == Op_MaxV
1077     if (elem_bt == T_BYTE) {
1078       vpmaxsb(dst, src1, src2, vlen_enc);
1079     } else if (elem_bt == T_SHORT) {
1080       vpmaxsw(dst, src1, src2, vlen_enc);
1081     } else if (elem_bt == T_INT) {
1082       vpmaxsd(dst, src1, src2, vlen_enc);
1083     } else {
1084       assert(elem_bt == T_LONG, "required");
1085       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1086         vpmaxsq(dst, src1, src2, vlen_enc);
1087       } else {
1088         assert_different_registers(dst, src1, src2);
1089         vpcmpgtq(dst, src1, src2, vlen_enc);
1090         vblendvpd(dst, src2, src1, dst, vlen_enc);
1091       }
1092     }
1093   }
1094 }
1095 
1096 // Float/Double min max
1097 
1098 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1099                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1100                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1101                                    int vlen_enc) {
1102   assert(UseAVX > 0, "required");
1103   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1104          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1105   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1106   assert_different_registers(a, tmp, atmp, btmp);
1107   assert_different_registers(b, tmp, atmp, btmp);
1108 
1109   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1110   bool is_double_word = is_double_word_type(elem_bt);
1111 
1112   /* Note on 'non-obvious' assembly sequence:
1113    *
1114    * While there are vminps/vmaxps instructions, there are two important differences between hardware
1115    * and Java on how they handle floats:
1116    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
1117    *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
1118    *
1119    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
1120    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
1121    *                (only useful when signs differ, noop otherwise)
1122    *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
1123    *
1124    *  Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
1125    *   btmp = (b < +0.0) ? a : b
1126    *   atmp = (b < +0.0) ? b : a
1127    *   Tmp  = Max_Float(atmp , btmp)
1128    *   Res  = (atmp == NaN) ? atmp : Tmp
1129    */
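
       // For reference, the corresponding min[FD] pseudo code (illustrative; 'mask' is 'a' for min):
       //   btmp = (a < +0.0) ? a : b
       //   atmp = (a < +0.0) ? b : a
       //   Tmp  = Min_Float(atmp , btmp)
       //   Res  = (atmp == NaN) ? atmp : Tmp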
1130 
1131   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
1132   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
1133   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
1134   XMMRegister mask;
1135 
1136   if (!is_double_word && is_min) {
1137     mask = a;
1138     vblend = &MacroAssembler::vblendvps;
1139     vmaxmin = &MacroAssembler::vminps;
1140     vcmp = &MacroAssembler::vcmpps;
1141   } else if (!is_double_word && !is_min) {
1142     mask = b;
1143     vblend = &MacroAssembler::vblendvps;
1144     vmaxmin = &MacroAssembler::vmaxps;
1145     vcmp = &MacroAssembler::vcmpps;
1146   } else if (is_double_word && is_min) {
1147     mask = a;
1148     vblend = &MacroAssembler::vblendvpd;
1149     vmaxmin = &MacroAssembler::vminpd;
1150     vcmp = &MacroAssembler::vcmppd;
1151   } else {
1152     assert(is_double_word && !is_min, "sanity");
1153     mask = b;
1154     vblend = &MacroAssembler::vblendvpd;
1155     vmaxmin = &MacroAssembler::vmaxpd;
1156     vcmp = &MacroAssembler::vcmppd;
1157   }
1158 
1159   // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
1160   XMMRegister maxmin, scratch;
1161   if (dst == btmp) {
1162     maxmin = btmp;
1163     scratch = tmp;
1164   } else {
1165     maxmin = tmp;
1166     scratch = btmp;
1167   }
1168 
1169   bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
1170   if (precompute_mask && !is_double_word) {
1171     vpsrad(tmp, mask, 32, vlen_enc);
1172     mask = tmp;
1173   } else if (precompute_mask && is_double_word) {
1174     vpxor(tmp, tmp, tmp, vlen_enc);
1175     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1176     mask = tmp;
1177   }
1178 
1179   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1180   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1181   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1182   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1183   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1184 }
1185 
1186 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1187                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1188                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1189                                     int vlen_enc) {
1190   assert(UseAVX > 2, "required");
1191   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1192          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1193   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1194   assert_different_registers(dst, a, atmp, btmp);
1195   assert_different_registers(dst, b, atmp, btmp);
1196 
1197   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1198   bool is_double_word = is_double_word_type(elem_bt);
1199   bool merge = true;
1200 
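       // evpmovd2m/evpmovq2m below extract each lane's sign bit into ktmp; the masked blends
       // then order the operands so that vmin/vmax plus the UNORD compare reproduce Java's
       // -0.0/+0.0 and NaN semantics (same idea as the pseudo code in vminmax_fp above).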
1201   if (!is_double_word && is_min) {
1202     evpmovd2m(ktmp, a, vlen_enc);
1203     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1204     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1205     vminps(dst, atmp, btmp, vlen_enc);
1206     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1207     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1208   } else if (!is_double_word && !is_min) {
1209     evpmovd2m(ktmp, b, vlen_enc);
1210     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1211     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1212     vmaxps(dst, atmp, btmp, vlen_enc);
1213     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1214     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1215   } else if (is_double_word && is_min) {
1216     evpmovq2m(ktmp, a, vlen_enc);
1217     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1218     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1219     vminpd(dst, atmp, btmp, vlen_enc);
1220     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1221     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1222   } else {
1223     assert(is_double_word && !is_min, "sanity");
1224     evpmovq2m(ktmp, b, vlen_enc);
1225     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1226     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1227     vmaxpd(dst, atmp, btmp, vlen_enc);
1228     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1229     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1230   }
1231 }
1232 
1233 void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1234                                    XMMRegister src1, XMMRegister src2, int vlen_enc) {
1235   assert(opc == Op_MinV || opc == Op_MinReductionV ||
1236          opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");
1237 
1238   int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_MINMAX_MIN_COMPARE_SIGN
1239                                                          : AVX10_MINMAX_MAX_COMPARE_SIGN;
1240   if (elem_bt == T_FLOAT) {
1241     evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
1242   } else {
1243     assert(elem_bt == T_DOUBLE, "");
1244     evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
1245   }
1246 }
1247 
1248 // Float/Double signum
1249 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1250   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
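       // Signum semantics handled below: NaN -> NaN, +-0.0 -> +-0.0 (argument returned as-is),
       // x > 0 -> 1.0, x < 0 -> -1.0 (1.0 with its sign bit flipped).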
1251 
1252   Label DONE_LABEL;
1253 
1254   if (opcode == Op_SignumF) {
1255     ucomiss(dst, zero);
1256     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1257     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1258     movflt(dst, one);
1259     jcc(Assembler::above, DONE_LABEL);
1260     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1261   } else if (opcode == Op_SignumD) {
1262     ucomisd(dst, zero);
1263     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1264     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1265     movdbl(dst, one);
1266     jcc(Assembler::above, DONE_LABEL);
1267     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1268   }
1269 
1270   bind(DONE_LABEL);
1271 }
1272 
1273 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1274   if (sign) {
1275     pmovsxbw(dst, src);
1276   } else {
1277     pmovzxbw(dst, src);
1278   }
1279 }
1280 
1281 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1282   if (sign) {
1283     vpmovsxbw(dst, src, vector_len);
1284   } else {
1285     vpmovzxbw(dst, src, vector_len);
1286   }
1287 }
1288 
1289 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1290   if (sign) {
1291     vpmovsxbd(dst, src, vector_len);
1292   } else {
1293     vpmovzxbd(dst, src, vector_len);
1294   }
1295 }
1296 
1297 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1298   if (sign) {
1299     vpmovsxwd(dst, src, vector_len);
1300   } else {
1301     vpmovzxwd(dst, src, vector_len);
1302   }
1303 }
1304 
1305 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1306                                      int shift, int vector_len) {
1307   if (opcode == Op_RotateLeftV) {
1308     if (etype == T_INT) {
1309       evprold(dst, src, shift, vector_len);
1310     } else {
1311       assert(etype == T_LONG, "expected type T_LONG");
1312       evprolq(dst, src, shift, vector_len);
1313     }
1314   } else {
1315     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1316     if (etype == T_INT) {
1317       evprord(dst, src, shift, vector_len);
1318     } else {
1319       assert(etype == T_LONG, "expected type T_LONG");
1320       evprorq(dst, src, shift, vector_len);
1321     }
1322   }
1323 }
1324 
1325 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1326                                      XMMRegister shift, int vector_len) {
1327   if (opcode == Op_RotateLeftV) {
1328     if (etype == T_INT) {
1329       evprolvd(dst, src, shift, vector_len);
1330     } else {
1331       assert(etype == T_LONG, "expected type T_LONG");
1332       evprolvq(dst, src, shift, vector_len);
1333     }
1334   } else {
1335     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1336     if (etype == T_INT) {
1337       evprorvd(dst, src, shift, vector_len);
1338     } else {
1339       assert(etype == T_LONG, "expected type T_LONG");
1340       evprorvq(dst, src, shift, vector_len);
1341     }
1342   }
1343 }
1344 
1345 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1346   if (opcode == Op_RShiftVI) {
1347     psrad(dst, shift);
1348   } else if (opcode == Op_LShiftVI) {
1349     pslld(dst, shift);
1350   } else {
1351     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1352     psrld(dst, shift);
1353   }
1354 }
1355 
1356 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1357   switch (opcode) {
1358     case Op_RShiftVI:  psrad(dst, shift); break;
1359     case Op_LShiftVI:  pslld(dst, shift); break;
1360     case Op_URShiftVI: psrld(dst, shift); break;
1361 
1362     default: assert(false, "%s", NodeClassNames[opcode]);
1363   }
1364 }
1365 
1366 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1367   if (opcode == Op_RShiftVI) {
1368     vpsrad(dst, nds, shift, vector_len);
1369   } else if (opcode == Op_LShiftVI) {
1370     vpslld(dst, nds, shift, vector_len);
1371   } else {
1372     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1373     vpsrld(dst, nds, shift, vector_len);
1374   }
1375 }
1376 
1377 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1378   switch (opcode) {
1379     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1380     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1381     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1382 
1383     default: assert(false, "%s", NodeClassNames[opcode]);
1384   }
1385 }
1386 
1387 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1388   switch (opcode) {
1389     case Op_RShiftVB:  // fall-through
1390     case Op_RShiftVS:  psraw(dst, shift); break;
1391 
1392     case Op_LShiftVB:  // fall-through
1393     case Op_LShiftVS:  psllw(dst, shift);   break;
1394 
1395     case Op_URShiftVS: // fall-through
1396     case Op_URShiftVB: psrlw(dst, shift);  break;
1397 
1398     default: assert(false, "%s", NodeClassNames[opcode]);
1399   }
1400 }
1401 
1402 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1403   switch (opcode) {
1404     case Op_RShiftVB:  // fall-through
1405     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1406 
1407     case Op_LShiftVB:  // fall-through
1408     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1409 
1410     case Op_URShiftVS: // fall-through
1411     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1412 
1413     default: assert(false, "%s", NodeClassNames[opcode]);
1414   }
1415 }
1416 
1417 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1418   switch (opcode) {
1419     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1420     case Op_LShiftVL:  psllq(dst, shift); break;
1421     case Op_URShiftVL: psrlq(dst, shift); break;
1422 
1423     default: assert(false, "%s", NodeClassNames[opcode]);
1424   }
1425 }
1426 
1427 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1428   if (opcode == Op_RShiftVL) {
1429     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1430   } else if (opcode == Op_LShiftVL) {
1431     psllq(dst, shift);
1432   } else {
1433     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1434     psrlq(dst, shift);
1435   }
1436 }
1437 
1438 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1439   switch (opcode) {
1440     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1441     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1442     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1443 
1444     default: assert(false, "%s", NodeClassNames[opcode]);
1445   }
1446 }
1447 
1448 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1449   if (opcode == Op_RShiftVL) {
1450     evpsraq(dst, nds, shift, vector_len);
1451   } else if (opcode == Op_LShiftVL) {
1452     vpsllq(dst, nds, shift, vector_len);
1453   } else {
1454     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1455     vpsrlq(dst, nds, shift, vector_len);
1456   }
1457 }
1458 
1459 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1460   switch (opcode) {
1461     case Op_RShiftVB:  // fall-through
1462     case Op_RShiftVS:  // fall-through
1463     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1464 
1465     case Op_LShiftVB:  // fall-through
1466     case Op_LShiftVS:  // fall-through
1467     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1468 
1469     case Op_URShiftVB: // fall-through
1470     case Op_URShiftVS: // fall-through
1471     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1472 
1473     default: assert(false, "%s", NodeClassNames[opcode]);
1474   }
1475 }
1476 
1477 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1478   switch (opcode) {
1479     case Op_RShiftVB:  // fall-through
1480     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1481 
1482     case Op_LShiftVB:  // fall-through
1483     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1484 
1485     case Op_URShiftVB: // fall-through
1486     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1487 
1488     default: assert(false, "%s", NodeClassNames[opcode]);
1489   }
1490 }
1491 
1492 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1493   assert(UseAVX >= 2, "required");
1494   switch (opcode) {
1495     case Op_RShiftVL: {
1496       if (UseAVX > 2) {
1497         assert(tmp == xnoreg, "not used");
1498         if (!VM_Version::supports_avx512vl()) {
1499           vlen_enc = Assembler::AVX_512bit;
1500         }
1501         evpsravq(dst, src, shift, vlen_enc);
1502       } else {
1503         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1504         vpsrlvq(dst, src, shift, vlen_enc);
1505         vpsrlvq(tmp, tmp, shift, vlen_enc);
1506         vpxor(dst, dst, tmp, vlen_enc);
1507         vpsubq(dst, dst, tmp, vlen_enc);
1508       }
1509       break;
1510     }
1511     case Op_LShiftVL: {
1512       assert(tmp == xnoreg, "not used");
1513       vpsllvq(dst, src, shift, vlen_enc);
1514       break;
1515     }
1516     case Op_URShiftVL: {
1517       assert(tmp == xnoreg, "not used");
1518       vpsrlvq(dst, src, shift, vlen_enc);
1519       break;
1520     }
1521     default: assert(false, "%s", NodeClassNames[opcode]);
1522   }
1523 }
1524 
1525 // Variable shift of src by shift, using vtmp as a TEMP, giving a word result in dst
1526 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1527   assert(opcode == Op_LShiftVB ||
1528          opcode == Op_RShiftVB ||
1529          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1530   bool sign = (opcode != Op_URShiftVB);
1531   assert(vector_len == 0, "required");
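       // Widen the bytes to dwords, apply the per-lane variable shift, mask the
       // results back into byte range, then pack the dwords down to word lanes.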
1532   vextendbd(sign, dst, src, 1);
1533   vpmovzxbd(vtmp, shift, 1);
1534   varshiftd(opcode, dst, dst, vtmp, 1);
1535   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1536   vextracti128_high(vtmp, dst);
1537   vpackusdw(dst, dst, vtmp, 0);
1538 }
1539 
1540 // Variable shift of src by shift, using vtmp as a TEMP, giving a byte result in dst
1541 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1542   assert(opcode == Op_LShiftVB ||
1543          opcode == Op_RShiftVB ||
1544          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1545   bool sign = (opcode != Op_URShiftVB);
1546   int ext_vector_len = vector_len + 1;
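       // Widen the bytes to words one vector size up, shift per lane, mask back to
       // byte range, then pack the words back down to bytes (vpermq fixes the lane
       // interleaving left by vpackuswb in the 256-bit result).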
1547   vextendbw(sign, dst, src, ext_vector_len);
1548   vpmovzxbw(vtmp, shift, ext_vector_len);
1549   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1550   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1551   if (vector_len == 0) {
1552     vextracti128_high(vtmp, dst);
1553     vpackuswb(dst, dst, vtmp, vector_len);
1554   } else {
1555     vextracti64x4_high(vtmp, dst);
1556     vpackuswb(dst, dst, vtmp, vector_len);
1557     vpermq(dst, dst, 0xD8, vector_len);
1558   }
1559 }
1560 
1561 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1562   switch(typ) {
1563     case T_BYTE:
1564       pinsrb(dst, val, idx);
1565       break;
1566     case T_SHORT:
1567       pinsrw(dst, val, idx);
1568       break;
1569     case T_INT:
1570       pinsrd(dst, val, idx);
1571       break;
1572     case T_LONG:
1573       pinsrq(dst, val, idx);
1574       break;
1575     default:
1576       assert(false,"Should not reach here.");
1577       break;
1578   }
1579 }
1580 
1581 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1582   switch(typ) {
1583     case T_BYTE:
1584       vpinsrb(dst, src, val, idx);
1585       break;
1586     case T_SHORT:
1587       vpinsrw(dst, src, val, idx);
1588       break;
1589     case T_INT:
1590       vpinsrd(dst, src, val, idx);
1591       break;
1592     case T_LONG:
1593       vpinsrq(dst, src, val, idx);
1594       break;
1595     default:
1596       assert(false,"Should not reach here.");
1597       break;
1598   }
1599 }
1600 
1601 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
1602                                                 XMMRegister dst, Register base,
1603                                                 Register idx_base,
1604                                                 Register offset, Register mask,
1605                                                 Register mask_idx, Register rtmp,
1606                                                 int vlen_enc) {
1607   vpxor(dst, dst, dst, vlen_enc);
1608   if (elem_bt == T_SHORT) {
1609     for (int i = 0; i < 4; i++) {
1610       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1611       Label skip_load;
1612       btq(mask, mask_idx);
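           // btq left bit 'mask_idx' of mask in CF; a clear bit means this lane is masked off, so skip the load.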
1613       jccb(Assembler::carryClear, skip_load);
1614       movl(rtmp, Address(idx_base, i * 4));
1615       if (offset != noreg) {
1616         addl(rtmp, offset);
1617       }
1618       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1619       bind(skip_load);
1620       incq(mask_idx);
1621     }
1622   } else {
1623     assert(elem_bt == T_BYTE, "");
1624     for (int i = 0; i < 8; i++) {
1625       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1626       Label skip_load;
1627       btq(mask, mask_idx);
1628       jccb(Assembler::carryClear, skip_load);
1629       movl(rtmp, Address(idx_base, i * 4));
1630       if (offset != noreg) {
1631         addl(rtmp, offset);
1632       }
1633       pinsrb(dst, Address(base, rtmp), i);
1634       bind(skip_load);
1635       incq(mask_idx);
1636     }
1637   }
1638 }
1639 
1640 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
1641                                          Register base, Register idx_base,
1642                                          Register offset, Register rtmp,
1643                                          int vlen_enc) {
1644   vpxor(dst, dst, dst, vlen_enc);
1645   if (elem_bt == T_SHORT) {
1646     for (int i = 0; i < 4; i++) {
1647       // dst[i] = src[offset + idx_base[i]]
1648       movl(rtmp, Address(idx_base, i * 4));
1649       if (offset != noreg) {
1650         addl(rtmp, offset);
1651       }
1652       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1653     }
1654   } else {
1655     assert(elem_bt == T_BYTE, "");
1656     for (int i = 0; i < 8; i++) {
1657       // dst[i] = src[offset + idx_base[i]]
1658       movl(rtmp, Address(idx_base, i * 4));
1659       if (offset != noreg) {
1660         addl(rtmp, offset);
1661       }
1662       pinsrb(dst, Address(base, rtmp), i);
1663     }
1664   }
1665 }
1666 
1667 /*
1668  * Gather using a hybrid algorithm: first, a partially unrolled scalar loop
1669  * accumulates values from the gather indices into a quad-word (64-bit) slice.
1670  * A slice may hold 8 byte or 4 short values. This is followed by a vector
1671  * permutation that places the slice into the appropriate vector lane
1672  * locations in the destination vector. The following pseudo code describes
1673  * the algorithm in detail:
1674  *
1675  * DST_VEC = ZERO_VEC
1676  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1677  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1678  * FOREACH_ITER:
1679  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1680  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1681  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1682  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1683  *
1684  * With each iteration, the doubleword permute indices (0, 1) corresponding
1685  * to the gathered quadword are shifted right by two lane positions.
1686  *
1687  */
1688 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1689                                         Register base, Register idx_base,
1690                                         Register offset, Register mask,
1691                                         XMMRegister xtmp1, XMMRegister xtmp2,
1692                                         XMMRegister temp_dst, Register rtmp,
1693                                         Register mask_idx, Register length,
1694                                         int vector_len, int vlen_enc) {
1695   Label GATHER8_LOOP;
1696   assert(is_subword_type(elem_ty), "");
1697   movl(length, vector_len);
1698   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1699   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1700   vallones(xtmp2, vlen_enc);
1701   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1702   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1703   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1704 
1705   bind(GATHER8_LOOP);
1706     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1707     if (mask == noreg) {
1708       vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc);
1709     } else {
1710       vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc);
1711     }
1712     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1713     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1714     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1715     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1716     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1717     vpor(dst, dst, temp_dst, vlen_enc);
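         // Advance past the int32 indices consumed this iteration (8 for bytes, 4 for shorts)
         // and count down the remaining elements.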
1718     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
1719     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1720     jcc(Assembler::notEqual, GATHER8_LOOP);
1721 }
1722 
1723 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1724   switch(typ) {
1725     case T_INT:
1726       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1727       break;
1728     case T_FLOAT:
1729       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1730       break;
1731     case T_LONG:
1732       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1733       break;
1734     case T_DOUBLE:
1735       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1736       break;
1737     default:
1738       assert(false,"Should not reach here.");
1739       break;
1740   }
1741 }
1742 
1743 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1744   switch(typ) {
1745     case T_INT:
1746       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1747       break;
1748     case T_FLOAT:
1749       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1750       break;
1751     case T_LONG:
1752       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1753       break;
1754     case T_DOUBLE:
1755       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1756       break;
1757     default:
1758       assert(false,"Should not reach here.");
1759       break;
1760   }
1761 }
1762 
1763 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1764   switch(typ) {
1765     case T_INT:
1766       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1767       break;
1768     case T_FLOAT:
1769       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1770       break;
1771     case T_LONG:
1772       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1773       break;
1774     case T_DOUBLE:
1775       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1776       break;
1777     default:
1778       assert(false,"Should not reach here.");
1779       break;
1780   }
1781 }
1782 
1783 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
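       // Turn a 0/1 byte mask into 0/-1 per byte (computed as 0 - x), then sign-extend it to the element width.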
1784   if (vlen_in_bytes <= 16) {
1785     pxor (dst, dst);
1786     psubb(dst, src);
1787     switch (elem_bt) {
1788       case T_BYTE:   /* nothing to do */ break;
1789       case T_SHORT:  pmovsxbw(dst, dst); break;
1790       case T_INT:    pmovsxbd(dst, dst); break;
1791       case T_FLOAT:  pmovsxbd(dst, dst); break;
1792       case T_LONG:   pmovsxbq(dst, dst); break;
1793       case T_DOUBLE: pmovsxbq(dst, dst); break;
1794 
1795       default: assert(false, "%s", type2name(elem_bt));
1796     }
1797   } else {
1798     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1799     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1800 
1801     vpxor (dst, dst, dst, vlen_enc);
1802     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1803 
1804     switch (elem_bt) {
1805       case T_BYTE:   /* nothing to do */            break;
1806       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1807       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1808       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1809       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1810       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1811 
1812       default: assert(false, "%s", type2name(elem_bt));
1813     }
1814   }
1815 }
1816 
1817 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
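       // Without AVX512 BW/DQ+VL (novlbwdq), widen the byte mask to dwords and use a compare to set the
       // k-register bits; otherwise negate the mask to 0/-1 bytes and let evpmovb2m collect the sign bits.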
1818   if (novlbwdq) {
1819     vpmovsxbd(xtmp, src, vlen_enc);
1820     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1821             Assembler::eq, true, vlen_enc, noreg);
1822   } else {
1823     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1824     vpsubb(xtmp, xtmp, src, vlen_enc);
1825     evpmovb2m(dst, xtmp, vlen_enc);
1826   }
1827 }
1828 
1829 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1830   if (is_integral_type(bt)) {
1831     switch (vlen_in_bytes) {
1832       case 4:  movdl(dst, src);   break;
1833       case 8:  movq(dst, src);    break;
1834       case 16: movdqu(dst, src);  break;
1835       case 32: vmovdqu(dst, src); break;
1836       case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1837       default: ShouldNotReachHere();
1838     }
1839   } else {
1840     switch (vlen_in_bytes) {
1841       case 4:  movflt(dst, src); break;
1842       case 8:  movdbl(dst, src); break;
1843       case 16: movups(dst, src); break;
1844       case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1845       case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1846       default: ShouldNotReachHere();
1847     }
1848   }
1849 }
1850 
1851 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1852   assert(rscratch != noreg || always_reachable(src), "missing");
1853 
1854   if (reachable(src)) {
1855     load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1856   } else {
1857     lea(rscratch, src);
1858     load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1859   }
1860 }
1861 
1862 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1863   int vlen_enc = vector_length_encoding(vlen);
1864   if (VM_Version::supports_avx()) {
1865     if (bt == T_LONG) {
1866       if (VM_Version::supports_avx2()) {
1867         vpbroadcastq(dst, src, vlen_enc);
1868       } else {
1869         vmovddup(dst, src, vlen_enc);
1870       }
1871     } else if (bt == T_DOUBLE) {
1872       if (vlen_enc != Assembler::AVX_128bit) {
1873         vbroadcastsd(dst, src, vlen_enc, noreg);
1874       } else {
1875         vmovddup(dst, src, vlen_enc);
1876       }
1877     } else {
1878       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1879         vpbroadcastd(dst, src, vlen_enc);
1880       } else {
1881         vbroadcastss(dst, src, vlen_enc);
1882       }
1883     }
1884   } else if (VM_Version::supports_sse3()) {
1885     movddup(dst, src);
1886   } else {
1887     load_vector(bt, dst, src, vlen);
1888   }
1889 }
1890 
1891 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1892   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1893   int offset = exact_log2(type2aelembytes(bt)) << 6;
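       // T_FLOAT and T_DOUBLE share element sizes with T_INT and T_LONG, so skip past those two
       // 64-byte integral tables to reach the floating-point entries.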
1894   if (is_floating_point_type(bt)) {
1895     offset += 128;
1896   }
1897   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1898   load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1899 }
1900 
1901 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1902 
1903 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1904   int vector_len = Assembler::AVX_128bit;
1905 
1906   switch (opcode) {
1907     case Op_AndReductionV:  pand(dst, src); break;
1908     case Op_OrReductionV:   por (dst, src); break;
1909     case Op_XorReductionV:  pxor(dst, src); break;
1910     case Op_MinReductionV:
1911       switch (typ) {
1912         case T_BYTE:        pminsb(dst, src); break;
1913         case T_SHORT:       pminsw(dst, src); break;
1914         case T_INT:         pminsd(dst, src); break;
1915         case T_LONG:        assert(UseAVX > 2, "required");
1916                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1917         default:            assert(false, "wrong type");
1918       }
1919       break;
1920     case Op_MaxReductionV:
1921       switch (typ) {
1922         case T_BYTE:        pmaxsb(dst, src); break;
1923         case T_SHORT:       pmaxsw(dst, src); break;
1924         case T_INT:         pmaxsd(dst, src); break;
1925         case T_LONG:        assert(UseAVX > 2, "required");
1926                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1927         default:            assert(false, "wrong type");
1928       }
1929       break;
1930     case Op_AddReductionVF: addss(dst, src); break;
1931     case Op_AddReductionVD: addsd(dst, src); break;
1932     case Op_AddReductionVI:
1933       switch (typ) {
1934         case T_BYTE:        paddb(dst, src); break;
1935         case T_SHORT:       paddw(dst, src); break;
1936         case T_INT:         paddd(dst, src); break;
1937         default:            assert(false, "wrong type");
1938       }
1939       break;
1940     case Op_AddReductionVL: paddq(dst, src); break;
1941     case Op_MulReductionVF: mulss(dst, src); break;
1942     case Op_MulReductionVD: mulsd(dst, src); break;
1943     case Op_MulReductionVI:
1944       switch (typ) {
1945         case T_SHORT:       pmullw(dst, src); break;
1946         case T_INT:         pmulld(dst, src); break;
1947         default:            assert(false, "wrong type");
1948       }
1949       break;
1950     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1951                             evpmullq(dst, dst, src, vector_len); break;
1952     default:                assert(false, "wrong opcode");
1953   }
1954 }
1955 
1956 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1957   switch (opcode) {
1958     case Op_AddReductionVF: addps(dst, src); break;
1959     case Op_AddReductionVD: addpd(dst, src); break;
1960     case Op_MulReductionVF: mulps(dst, src); break;
1961     case Op_MulReductionVD: mulpd(dst, src); break;
1962     default:                assert(false, "%s", NodeClassNames[opcode]);
1963   }
1964 }
1965 
1966 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1967   int vector_len = Assembler::AVX_256bit;
1968 
1969   switch (opcode) {
1970     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1971     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1972     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1973     case Op_MinReductionV:
1974       switch (typ) {
1975         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1976         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1977         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1978         case T_LONG:        assert(UseAVX > 2, "required");
1979                             vpminsq(dst, src1, src2, vector_len); break;
1980         default:            assert(false, "wrong type");
1981       }
1982       break;
1983     case Op_MaxReductionV:
1984       switch (typ) {
1985         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1986         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1987         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1988         case T_LONG:        assert(UseAVX > 2, "required");
1989                             vpmaxsq(dst, src1, src2, vector_len); break;
1990         default:            assert(false, "wrong type");
1991       }
1992       break;
1993     case Op_AddReductionVI:
1994       switch (typ) {
1995         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1996         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1997         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1998         default:            assert(false, "wrong type");
1999       }
2000       break;
2001     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
2002     case Op_MulReductionVI:
2003       switch (typ) {
2004         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
2005         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
2006         default:            assert(false, "wrong type");
2007       }
2008       break;
2009     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
2010     default:                assert(false, "wrong opcode");
2011   }
2012 }
2013 
2014 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
2015   int vector_len = Assembler::AVX_256bit;
2016 
2017   switch (opcode) {
2018     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
2019     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
2020     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
2021     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
2022     default:                assert(false, "%s", NodeClassNames[opcode]);
2023   }
2024 }
2025 
2026 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
2027                                   XMMRegister dst, XMMRegister src,
2028                                   XMMRegister vtmp1, XMMRegister vtmp2) {
2029   switch (opcode) {
2030     case Op_AddReductionVF:
2031     case Op_MulReductionVF:
2032       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
2033       break;
2034 
2035     case Op_AddReductionVD:
2036     case Op_MulReductionVD:
2037       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
2038       break;
2039 
2040     default: assert(false, "wrong opcode");
2041   }
2042 }
2043 
2044 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
2045                                             XMMRegister dst, XMMRegister src,
2046                                             XMMRegister vtmp1, XMMRegister vtmp2) {
2047   switch (opcode) {
2048     case Op_AddReductionVF:
2049     case Op_MulReductionVF:
2050       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
2051       break;
2052 
2053     case Op_AddReductionVD:
2054     case Op_MulReductionVD:
2055       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
2056       break;
2057 
2058     default: assert(false, "%s", NodeClassNames[opcode]);
2059   }
2060 }
2061 
2062 void C2_MacroAssembler::reduceB(int opcode, int vlen,
2063                              Register dst, Register src1, XMMRegister src2,
2064                              XMMRegister vtmp1, XMMRegister vtmp2) {
2065   switch (vlen) {
2066     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2067     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2068     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2069     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2070 
2071     default: assert(false, "wrong vector length");
2072   }
2073 }
2074 
2075 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
2076                              Register dst, Register src1, XMMRegister src2,
2077                              XMMRegister vtmp1, XMMRegister vtmp2) {
2078   switch (vlen) {
2079     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2080     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2081     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2082     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2083 
2084     default: assert(false, "wrong vector length");
2085   }
2086 }
2087 
2088 void C2_MacroAssembler::reduceS(int opcode, int vlen,
2089                              Register dst, Register src1, XMMRegister src2,
2090                              XMMRegister vtmp1, XMMRegister vtmp2) {
2091   switch (vlen) {
2092     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2093     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2094     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2095     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2096 
2097     default: assert(false, "wrong vector length");
2098   }
2099 }
2100 
2101 void C2_MacroAssembler::reduceI(int opcode, int vlen,
2102                              Register dst, Register src1, XMMRegister src2,
2103                              XMMRegister vtmp1, XMMRegister vtmp2) {
2104   switch (vlen) {
2105     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2106     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2107     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2108     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2109 
2110     default: assert(false, "wrong vector length");
2111   }
2112 }
2113 
2114 void C2_MacroAssembler::reduceL(int opcode, int vlen,
2115                              Register dst, Register src1, XMMRegister src2,
2116                              XMMRegister vtmp1, XMMRegister vtmp2) {
2117   switch (vlen) {
2118     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2119     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2120     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2121 
2122     default: assert(false, "wrong vector length");
2123   }
2124 }
2125 
2126 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2127   switch (vlen) {
2128     case 2:
2129       assert(vtmp2 == xnoreg, "");
2130       reduce2F(opcode, dst, src, vtmp1);
2131       break;
2132     case 4:
2133       assert(vtmp2 == xnoreg, "");
2134       reduce4F(opcode, dst, src, vtmp1);
2135       break;
2136     case 8:
2137       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2138       break;
2139     case 16:
2140       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2141       break;
2142     default: assert(false, "wrong vector length");
2143   }
2144 }
2145 
2146 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2147   switch (vlen) {
2148     case 2:
2149       assert(vtmp2 == xnoreg, "");
2150       reduce2D(opcode, dst, src, vtmp1);
2151       break;
2152     case 4:
2153       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2154       break;
2155     case 8:
2156       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2157       break;
2158     default: assert(false, "wrong vector length");
2159   }
2160 }
2161 
2162 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2163   switch (vlen) {
2164     case 2:
2165       assert(vtmp1 == xnoreg, "");
2166       assert(vtmp2 == xnoreg, "");
2167       unorderedReduce2F(opcode, dst, src);
2168       break;
2169     case 4:
2170       assert(vtmp2 == xnoreg, "");
2171       unorderedReduce4F(opcode, dst, src, vtmp1);
2172       break;
2173     case 8:
2174       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2175       break;
2176     case 16:
2177       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2178       break;
2179     default: assert(false, "wrong vector length");
2180   }
2181 }
2182 
2183 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2184   switch (vlen) {
2185     case 2:
2186       assert(vtmp1 == xnoreg, "");
2187       assert(vtmp2 == xnoreg, "");
2188       unorderedReduce2D(opcode, dst, src);
2189       break;
2190     case 4:
2191       assert(vtmp2 == xnoreg, "");
2192       unorderedReduce4D(opcode, dst, src, vtmp1);
2193       break;
2194     case 8:
2195       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2196       break;
2197     default: assert(false, "wrong vector length");
2198   }
2199 }
2200 
2201 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
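       // Fold the two int lanes (phaddd for addition, shuffle-and-combine otherwise),
       // then fold in the scalar src1 and move the result into dst.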
2202   if (opcode == Op_AddReductionVI) {
2203     if (vtmp1 != src2) {
2204       movdqu(vtmp1, src2);
2205     }
2206     phaddd(vtmp1, vtmp1);
2207   } else {
2208     pshufd(vtmp1, src2, 0x1);
2209     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2210   }
2211   movdl(vtmp2, src1);
2212   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2213   movdl(dst, vtmp1);
2214 }
2215 
2216 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2217   if (opcode == Op_AddReductionVI) {
2218     if (vtmp1 != src2) {
2219       movdqu(vtmp1, src2);
2220     }
2221     phaddd(vtmp1, src2);
2222     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2223   } else {
2224     pshufd(vtmp2, src2, 0xE);
2225     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2226     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2227   }
2228 }
2229 
2230 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2231   if (opcode == Op_AddReductionVI) {
2232     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2233     vextracti128_high(vtmp2, vtmp1);
2234     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2235     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2236   } else {
2237     vextracti128_high(vtmp1, src2);
2238     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2239     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2240   }
2241 }
2242 
2243 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2244   vextracti64x4_high(vtmp2, src2);
2245   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2246   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2247 }
2248 
2249 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
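       // Fold the eight byte lanes pairwise (shuffle down by 4 bytes, then shift by 2 and 1),
       // widen to int, fold in src1, and sign-extend the final byte into dst.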
2250   pshufd(vtmp2, src2, 0x1);
2251   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2252   movdqu(vtmp1, vtmp2);
2253   psrldq(vtmp1, 2);
2254   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2255   movdqu(vtmp2, vtmp1);
2256   psrldq(vtmp2, 1);
2257   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2258   movdl(vtmp2, src1);
2259   pmovsxbd(vtmp1, vtmp1);
2260   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2261   pextrb(dst, vtmp1, 0x0);
2262   movsbl(dst, dst);
2263 }
2264 
2265 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2266   pshufd(vtmp1, src2, 0xE);
2267   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2268   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2269 }
2270 
2271 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2272   vextracti128_high(vtmp2, src2);
2273   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2274   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2275 }
2276 
2277 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2278   vextracti64x4_high(vtmp1, src2);
2279   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2280   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2281 }
2282 
2283 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
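       // There is no byte multiply instruction, so widen to shorts and reduce as a short vector.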
2284   pmovsxbw(vtmp2, src2);
2285   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2286 }
2287 
2288 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2289   if (UseAVX > 1) {
2290     int vector_len = Assembler::AVX_256bit;
2291     vpmovsxbw(vtmp1, src2, vector_len);
2292     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2293   } else {
2294     pmovsxbw(vtmp2, src2);
2295     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2296     pshufd(vtmp2, src2, 0x1);
2297     pmovsxbw(vtmp2, src2);
2298     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2299   }
2300 }
2301 
2302 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2303   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2304     int vector_len = Assembler::AVX_512bit;
2305     vpmovsxbw(vtmp1, src2, vector_len);
2306     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2307   } else {
2308     assert(UseAVX >= 2,"Should not reach here.");
2309     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2310     vextracti128_high(vtmp2, src2);
2311     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2312   }
2313 }
2314 
2315 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2316   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2317   vextracti64x4_high(vtmp2, src2);
2318   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2319 }
2320 
2321 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2322   if (opcode == Op_AddReductionVI) {
2323     if (vtmp1 != src2) {
2324       movdqu(vtmp1, src2);
2325     }
2326     phaddw(vtmp1, vtmp1);
2327     phaddw(vtmp1, vtmp1);
2328   } else {
2329     pshufd(vtmp2, src2, 0x1);
2330     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2331     movdqu(vtmp1, vtmp2);
2332     psrldq(vtmp1, 2);
2333     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2334   }
2335   movdl(vtmp2, src1);
2336   pmovsxwd(vtmp1, vtmp1);
2337   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2338   pextrw(dst, vtmp1, 0x0);
2339   movswl(dst, dst);
2340 }
2341 
2342 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2343   if (opcode == Op_AddReductionVI) {
2344     if (vtmp1 != src2) {
2345       movdqu(vtmp1, src2);
2346     }
2347     phaddw(vtmp1, src2);
2348   } else {
2349     pshufd(vtmp1, src2, 0xE);
2350     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2351   }
2352   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2353 }
2354 
2355 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2356   if (opcode == Op_AddReductionVI) {
2357     int vector_len = Assembler::AVX_256bit;
2358     vphaddw(vtmp2, src2, src2, vector_len);
2359     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2360   } else {
2361     vextracti128_high(vtmp2, src2);
2362     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2363   }
2364   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2365 }
2366 
2367 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2368   int vector_len = Assembler::AVX_256bit;
2369   vextracti64x4_high(vtmp1, src2);
2370   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2371   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2372 }
2373 
2374 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2375   pshufd(vtmp2, src2, 0xE);
2376   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2377   movdq(vtmp1, src1);
2378   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2379   movdq(dst, vtmp1);
2380 }
2381 
2382 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2383   vextracti128_high(vtmp1, src2);
2384   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2385   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2386 }
2387 
2388 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2389   vextracti64x4_high(vtmp2, src2);
2390   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2391   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2392 }
2393 
2394 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
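       // Build a k-register mask with the low 'len' bits set: start from all ones and clear the bits above 'len' with bzhi.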
2395   mov64(temp, -1L);
2396   bzhiq(temp, temp, len);
2397   kmovql(dst, temp);
2398 }
2399 
2400 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
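       // Ordered float reduction: fold src lane 0 into the running value in dst, then lane 1.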
2401   reduce_operation_128(T_FLOAT, opcode, dst, src);
2402   pshufd(vtmp, src, 0x1);
2403   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2404 }
2405 
2406 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2407   reduce2F(opcode, dst, src, vtmp);
2408   pshufd(vtmp, src, 0x2);
2409   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2410   pshufd(vtmp, src, 0x3);
2411   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2412 }
2413 
2414 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2415   reduce4F(opcode, dst, src, vtmp2);
2416   vextractf128_high(vtmp2, src);
2417   reduce4F(opcode, dst, vtmp2, vtmp1);
2418 }
2419 
2420 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2421   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2422   vextracti64x4_high(vtmp1, src);
2423   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2424 }
2425 
2426 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2427   pshufd(dst, src, 0x1);
2428   reduce_operation_128(T_FLOAT, opcode, dst, src);
2429 }
2430 
2431 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2432   pshufd(vtmp, src, 0xE);
2433   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2434   unorderedReduce2F(opcode, dst, vtmp);
2435 }
2436 
2437 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2438   vextractf128_high(vtmp1, src);
2439   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2440   unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2441 }
2442 
2443 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2444   vextractf64x4_high(vtmp2, src);
2445   unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2446   unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2447 }
2448 
2449 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2450   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2451   pshufd(vtmp, src, 0xE);
2452   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2453 }
2454 
2455 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2456   reduce2D(opcode, dst, src, vtmp2);
2457   vextractf128_high(vtmp2, src);
2458   reduce2D(opcode, dst, vtmp2, vtmp1);
2459 }
2460 
2461 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2462   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2463   vextracti64x4_high(vtmp1, src);
2464   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2465 }
2466 
2467 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2468   pshufd(dst, src, 0xE);
2469   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2470 }
2471 
2472 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2473   vextractf128_high(vtmp, src);
2474   unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2475   unorderedReduce2D(opcode, dst, vtmp);
2476 }
2477 
2478 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2479   vextractf64x4_high(vtmp2, src);
2480   unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2481   unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2482 }
2483 
2484 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2485   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2486 }
2487 
2488 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2489   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2490 }
2491 
2492 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
2493   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2494 }
2495 
2496 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2497                                  int vec_enc) {
2498   switch(elem_bt) {
2499     case T_INT:
2500     case T_FLOAT:
2501       vmaskmovps(dst, src, mask, vec_enc);
2502       break;
2503     case T_LONG:
2504     case T_DOUBLE:
2505       vmaskmovpd(dst, src, mask, vec_enc);
2506       break;
2507     default:
2508       fatal("Unsupported type %s", type2name(elem_bt));
2509       break;
2510   }
2511 }
2512 
2513 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2514                                  int vec_enc) {
2515   switch(elem_bt) {
2516     case T_INT:
2517     case T_FLOAT:
2518       vmaskmovps(dst, src, mask, vec_enc);
2519       break;
2520     case T_LONG:
2521     case T_DOUBLE:
2522       vmaskmovpd(dst, src, mask, vec_enc);
2523       break;
2524     default:
2525       fatal("Unsupported type %s", type2name(elem_bt));
2526       break;
2527   }
2528 }
2529 
2530 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2531                                           XMMRegister dst, XMMRegister src,
2532                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2533                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2534   const int permconst[] = {1, 14};
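       // permconst[1] = 14 (0b1110) moves lanes 2..3 down into lanes 0..1; permconst[0] = 1 brings lane 1 into lane 0 for the final fold.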
2535   XMMRegister wsrc = src;
2536   XMMRegister wdst = xmm_0;
2537   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2538 
2539   int vlen_enc = Assembler::AVX_128bit;
2540   if (vlen == 16) {
2541     vlen_enc = Assembler::AVX_256bit;
2542   }
2543 
2544   for (int i = log2(vlen) - 1; i >=0; i--) {
2545     if (i == 0 && !is_dst_valid) {
2546       wdst = dst;
2547     }
2548     if (i == 3) {
2549       vextracti64x4_high(wtmp, wsrc);
2550     } else if (i == 2) {
2551       vextracti128_high(wtmp, wsrc);
2552     } else { // i = [0,1]
2553       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2554     }
2555 
2556     if (VM_Version::supports_avx10_2()) {
2557       vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
2558     } else {
2559       vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2560     }
2561     wsrc = wdst;
2562     vlen_enc = Assembler::AVX_128bit;
2563   }
2564   if (is_dst_valid) {
2565     if (VM_Version::supports_avx10_2()) {
2566       vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
2567     } else {
2568       vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2569     }
2570   }
2571 }
2572 
2573 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2574                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2575                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2576   XMMRegister wsrc = src;
2577   XMMRegister wdst = xmm_0;
2578   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2579   int vlen_enc = Assembler::AVX_128bit;
2580   if (vlen == 8) {
2581     vlen_enc = Assembler::AVX_256bit;
2582   }
2583   for (int i = log2(vlen) - 1; i >=0; i--) {
2584     if (i == 0 && !is_dst_valid) {
2585       wdst = dst;
2586     }
2587     if (i == 1) {
2588       vextracti128_high(wtmp, wsrc);
2589     } else if (i == 2) {
2590       vextracti64x4_high(wtmp, wsrc);
2591     } else {
2592       assert(i == 0, "%d", i);
2593       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2594     }
2595 
2596     if (VM_Version::supports_avx10_2()) {
2597       vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
2598     } else {
2599       vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2600     }
2601 
2602     wsrc = wdst;
2603     vlen_enc = Assembler::AVX_128bit;
2604   }
2605 
2606   if (is_dst_valid) {
2607     if (VM_Version::supports_avx10_2()) {
2608       vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
2609     } else {
2610       vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2611     }
2612   }
2613 }
2614 
2615 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2616   switch (bt) {
2617     case T_BYTE:  pextrb(dst, src, idx); break;
2618     case T_SHORT: pextrw(dst, src, idx); break;
2619     case T_INT:   pextrd(dst, src, idx); break;
2620     case T_LONG:  pextrq(dst, src, idx); break;
2621 
2622     default:
2623       assert(false,"Should not reach here.");
2624       break;
2625   }
2626 }
2627 
2628 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
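       // Return the XMM register holding the 128-bit lane that contains 'elemindex', extracting that lane into dst when it is not the lowest one.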
2629   int esize =  type2aelembytes(typ);
2630   int elem_per_lane = 16/esize;
2631   int lane = elemindex / elem_per_lane;
2632   int eindex = elemindex % elem_per_lane;
2633 
2634   if (lane >= 2) {
2635     assert(UseAVX > 2, "required");
2636     vextractf32x4(dst, src, lane & 3);
2637     return dst;
2638   } else if (lane > 0) {
2639     assert(UseAVX > 0, "required");
2640     vextractf128(dst, src, lane);
2641     return dst;
2642   } else {
2643     return src;
2644   }
2645 }
2646 
2647 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2648   if (typ == T_BYTE) {
2649     movsbl(dst, dst);
2650   } else if (typ == T_SHORT) {
2651     movswl(dst, dst);
2652   }
2653 }
2654 
2655 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2656   int esize =  type2aelembytes(typ);
2657   int elem_per_lane = 16/esize;
2658   int eindex = elemindex % elem_per_lane;
2659   assert(is_integral_type(typ),"required");
2660 
2661   if (eindex == 0) {
2662     if (typ == T_LONG) {
2663       movq(dst, src);
2664     } else {
2665       movdl(dst, src);
2666       movsxl(typ, dst);
2667     }
2668   } else {
2669     extract(typ, dst, src, eindex);
2670     movsxl(typ, dst);
2671   }
2672 }
2673 
2674 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2675   int esize =  type2aelembytes(typ);
2676   int elem_per_lane = 16/esize;
2677   int eindex = elemindex % elem_per_lane;
2678   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
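       // Bring the requested element down to the lowest lane (shufps for floats, a byte shift for doubles), then zero everything above it.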
2679 
2680   if (eindex == 0) {
2681     movq(dst, src);
2682   } else {
2683     if (typ == T_FLOAT) {
2684       if (UseAVX == 0) {
2685         movdqu(dst, src);
2686         shufps(dst, dst, eindex);
2687       } else {
2688         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2689       }
2690     } else {
2691       if (UseAVX == 0) {
2692         movdqu(dst, src);
2693         psrldq(dst, eindex*esize);
2694       } else {
2695         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2696       }
2697       movq(dst, dst);
2698     }
2699   }
2700   // Zero upper bits
2701   if (typ == T_FLOAT) {
2702     if (UseAVX == 0) {
2703       assert(vtmp != xnoreg, "required.");
2704       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2705       pand(dst, vtmp);
2706     } else {
2707       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2708     }
2709   }
2710 }
2711 
2712 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2713   switch(typ) {
2714     case T_BYTE:
2715     case T_BOOLEAN:
2716       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2717       break;
2718     case T_SHORT:
2719     case T_CHAR:
2720       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2721       break;
2722     case T_INT:
2723     case T_FLOAT:
2724       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2725       break;
2726     case T_LONG:
2727     case T_DOUBLE:
2728       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2729       break;
2730     default:
2731       assert(false,"Should not reach here.");
2732       break;
2733   }
2734 }
2735 
2736 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2737   assert(rscratch != noreg || always_reachable(src2), "missing");
2738 
2739   switch(typ) {
2740     case T_BOOLEAN:
2741     case T_BYTE:
2742       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2743       break;
2744     case T_CHAR:
2745     case T_SHORT:
2746       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2747       break;
2748     case T_INT:
2749     case T_FLOAT:
2750       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2751       break;
2752     case T_LONG:
2753     case T_DOUBLE:
2754       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2755       break;
2756     default:
2757       assert(false,"Should not reach here.");
2758       break;
2759   }
2760 }
2761 
2762 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2763   switch(typ) {
2764     case T_BYTE:
2765       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2766       break;
2767     case T_SHORT:
2768       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2769       break;
2770     case T_INT:
2771     case T_FLOAT:
2772       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2773       break;
2774     case T_LONG:
2775     case T_DOUBLE:
2776       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2777       break;
2778     default:
2779       assert(false,"Should not reach here.");
2780       break;
2781   }
2782 }
2783 
2784 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2785   assert(vlen_in_bytes <= 32, "");
2786   int esize = type2aelembytes(bt);
2787   if (vlen_in_bytes == 32) {
2788     assert(vtmp == xnoreg, "required.");
2789     if (esize >= 4) {
2790       vtestps(src1, src2, AVX_256bit);
2791     } else {
2792       vptest(src1, src2, AVX_256bit);
2793     }
2794     return;
2795   }
2796   if (vlen_in_bytes < 16) {
    // Duplicate the lower part to fill the whole register;
    // there is no need to do so for src2.
2799     assert(vtmp != xnoreg, "required");
2800     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
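    // pshufd imm 0x00 broadcasts dword 0 (4-byte case); imm 0x04 selects
    // dwords {0, 1, 0, 0}, so the upper half of vtmp is filled from the
    // valid lower 8 bytes (8-byte case).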
2801     pshufd(vtmp, src1, shuffle_imm);
2802   } else {
2803     assert(vtmp == xnoreg, "required");
2804     vtmp = src1;
2805   }
2806   if (esize >= 4 && VM_Version::supports_avx()) {
2807     vtestps(vtmp, src2, AVX_128bit);
2808   } else {
2809     ptest(vtmp, src2);
2810   }
2811 }
2812 
2813 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2814 #ifdef ASSERT
2815   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2816   bool is_bw_supported = VM_Version::supports_avx512bw();
2817   if (is_bw && !is_bw_supported) {
2818     assert(vlen_enc != Assembler::AVX_512bit, "required");
2819     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2820            "XMM register should be 0-15");
2821   }
2822 #endif // ASSERT
2823   switch (elem_bt) {
2824     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2825     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2826     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2827     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2828     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2829     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2830     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2831   }
2832 }
2833 
2834 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2835   assert(UseAVX >= 2, "required");
2836   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2837   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2838   if ((UseAVX > 2) &&
2839       (!is_bw || VM_Version::supports_avx512bw()) &&
2840       (!is_vl || VM_Version::supports_avx512vl())) {
2841     switch (elem_bt) {
2842       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2843       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2844       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2845       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2846       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2847     }
2848   } else {
2849     assert(vlen_enc != Assembler::AVX_512bit, "required");
2850     assert((dst->encoding() < 16),"XMM register should be 0-15");
2851     switch (elem_bt) {
2852       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2853       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2854       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2855       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2856       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2857       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2858       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2859     }
2860   }
2861 }
2862 
2863 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2864   switch (to_elem_bt) {
2865     case T_SHORT:
2866       vpmovsxbw(dst, src, vlen_enc);
2867       break;
2868     case T_INT:
2869       vpmovsxbd(dst, src, vlen_enc);
2870       break;
2871     case T_FLOAT:
2872       vpmovsxbd(dst, src, vlen_enc);
2873       vcvtdq2ps(dst, dst, vlen_enc);
2874       break;
2875     case T_LONG:
2876       vpmovsxbq(dst, src, vlen_enc);
2877       break;
2878     case T_DOUBLE: {
2879       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2880       vpmovsxbd(dst, src, mid_vlen_enc);
2881       vcvtdq2pd(dst, dst, vlen_enc);
2882       break;
2883     }
2884     default:
2885       fatal("Unsupported type %s", type2name(to_elem_bt));
2886       break;
2887   }
2888 }
2889 
2890 //-------------------------------------------------------------------------------------------
2891 
// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through the stack.
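// On return, 'result' holds the element index of the first occurrence of the
// substring, or -1 if it does not occur.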
2894 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2895                                          Register cnt1, Register cnt2,
2896                                          int int_cnt2,  Register result,
2897                                          XMMRegister vec, Register tmp,
2898                                          int ae) {
2899   ShortBranchVerifier sbv(this);
2900   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2901   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2902 
2903   // This method uses the pcmpestri instruction with bound registers
2904   //   inputs:
2905   //     xmm - substring
2906   //     rax - substring length (elements count)
2907   //     mem - scanned string
2908   //     rdx - string length (elements count)
2909   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2910   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2911   //   outputs:
2912   //     rcx - matched index in string
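  //   The imm8 mode decodes as: bits [1:0] select the element format
  //   (00 = unsigned bytes, 01 = unsigned words) and bits [3:2] = 11 select
  //   the "equal ordered" (substring search) aggregation; pcmpestri leaves
  //   the match index in rcx, sets CF when a match candidate was found,
  //   and sets OF when the match starts at element 0.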
2913   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2914   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2915   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2916   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2917   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2918 
2919   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2920         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2921         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2922 
2923   // Note, inline_string_indexOf() generates checks:
2924   // if (substr.count > string.count) return -1;
2925   // if (substr.count == 0) return 0;
2926   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2927 
2928   // Load substring.
2929   if (ae == StrIntrinsicNode::UL) {
2930     pmovzxbw(vec, Address(str2, 0));
2931   } else {
2932     movdqu(vec, Address(str2, 0));
2933   }
2934   movl(cnt2, int_cnt2);
2935   movptr(result, str1); // string addr
2936 
2937   if (int_cnt2 > stride) {
2938     jmpb(SCAN_TO_SUBSTR);
2939 
    // Reload substr for rescan; this code
    // is executed only for large substrings (> 8 chars).
2942     bind(RELOAD_SUBSTR);
2943     if (ae == StrIntrinsicNode::UL) {
2944       pmovzxbw(vec, Address(str2, 0));
2945     } else {
2946       movdqu(vec, Address(str2, 0));
2947     }
2948     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2949 
2950     bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
    // again. Start from the next element after the previous match.
2954 
    // cnt2 is the number of remaining substring elements and
    // cnt1 is the number of remaining string elements when the cmp failed.
2957     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2958     subl(cnt1, cnt2);
2959     addl(cnt1, int_cnt2);
2960     movl(cnt2, int_cnt2); // Now restore cnt2
2961 
2962     decrementl(cnt1);     // Shift to next element
2963     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2965 
2966     addptr(result, (1<<scale1));
2967 
2968   } // (int_cnt2 > 8)
2969 
2970   // Scan string for start of substr in 16-byte vectors
2971   bind(SCAN_TO_SUBSTR);
2972   pcmpestri(vec, Address(result, 0), mode);
2973   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2974   subl(cnt1, stride);
2975   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2976   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2978   addptr(result, 16);
2979   jmpb(SCAN_TO_SUBSTR);
2980 
2981   // Found a potential substr
2982   bind(FOUND_CANDIDATE);
2983   // Matched whole vector if first element matched (tmp(rcx) == 0).
2984   if (int_cnt2 == stride) {
2985     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2986   } else { // int_cnt2 > 8
2987     jccb(Assembler::overflow, FOUND_SUBSTR);
2988   }
2989   // After pcmpestri tmp(rcx) contains matched element index
2990   // Compute start addr of substr
2991   lea(result, Address(result, tmp, scale1));
2992 
2993   // Make sure string is still long enough
2994   subl(cnt1, tmp);
2995   cmpl(cnt1, cnt2);
2996   if (int_cnt2 == stride) {
2997     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2998   } else { // int_cnt2 > 8
2999     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
3000   }
  // Left less than substring.
3002 
3003   bind(RET_NOT_FOUND);
3004   movl(result, -1);
3005   jmp(EXIT);
3006 
3007   if (int_cnt2 > stride) {
3008     // This code is optimized for the case when whole substring
3009     // is matched if its head is matched.
3010     bind(MATCH_SUBSTR_HEAD);
3011     pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
3013     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
3014 
3015     Label CONT_SCAN_SUBSTR;
3016     // Compare the rest of substring (> 8 chars).
3017     bind(FOUND_SUBSTR);
3018     // First 8 chars are already matched.
3019     negptr(cnt2);
3020     addptr(cnt2, stride);
3021 
3022     bind(SCAN_SUBSTR);
3023     subl(cnt1, stride);
3024     cmpl(cnt2, -stride); // Do not read beyond substring
3025     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
3026     // Back-up strings to avoid reading beyond substring:
3027     // cnt1 = cnt1 - cnt2 + 8
3028     addl(cnt1, cnt2); // cnt2 is negative
3029     addl(cnt1, stride);
3030     movl(cnt2, stride); negptr(cnt2);
3031     bind(CONT_SCAN_SUBSTR);
3032     if (int_cnt2 < (int)G) {
3033       int tail_off1 = int_cnt2<<scale1;
3034       int tail_off2 = int_cnt2<<scale2;
3035       if (ae == StrIntrinsicNode::UL) {
3036         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
3037       } else {
3038         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
3039       }
3040       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
3041     } else {
3042       // calculate index in register to avoid integer overflow (int_cnt2*2)
3043       movl(tmp, int_cnt2);
3044       addptr(tmp, cnt2);
3045       if (ae == StrIntrinsicNode::UL) {
3046         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
3047       } else {
3048         movdqu(vec, Address(str2, tmp, scale2, 0));
3049       }
3050       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
3051     }
    // Need to reload string pointers if we did not match the whole vector
3053     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3054     addptr(cnt2, stride);
3055     jcc(Assembler::negative, SCAN_SUBSTR);
3056     // Fall through if found full substring
3057 
3058   } // (int_cnt2 > 8)
3059 
3060   bind(RET_FOUND);
3061   // Found result if we matched full small substring.
3062   // Compute substr offset
3063   subptr(result, str1);
3064   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3065     shrl(result, 1); // index
3066   }
3067   bind(EXIT);
3068 
3069 } // string_indexofC8
3070 
// Small strings are loaded through the stack if they cross a page boundary.
3072 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
3073                                        Register cnt1, Register cnt2,
3074                                        int int_cnt2,  Register result,
3075                                        XMMRegister vec, Register tmp,
3076                                        int ae) {
3077   ShortBranchVerifier sbv(this);
3078   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3079   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3080 
3081   //
  // int_cnt2 is the length of a small (< 8 chars) constant substring,
  // or (-1) for a non-constant substring, in which case its length
  // is in the cnt2 register.
3085   //
3086   // Note, inline_string_indexOf() generates checks:
3087   // if (substr.count > string.count) return -1;
3088   // if (substr.count == 0) return 0;
3089   //
3090   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
3091   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
3092   // This method uses the pcmpestri instruction with bound registers
3093   //   inputs:
3094   //     xmm - substring
3095   //     rax - substring length (elements count)
3096   //     mem - scanned string
3097   //     rdx - string length (elements count)
3098   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
3099   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
3100   //   outputs:
3101   //     rcx - matched index in string
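  //   (the imm8 mode bits decode as described in string_indexofC8 above)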
3102   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3103   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
3104   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
3105   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
3106 
3107   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
3108         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
3109         FOUND_CANDIDATE;
3110 
3111   { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through the stack.
3114     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
3115 
3116     movptr(tmp, rsp); // save old SP
3117 
3118     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
3119       if (int_cnt2 == (1>>scale2)) { // One byte
3120         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
3121         load_unsigned_byte(result, Address(str2, 0));
3122         movdl(vec, result); // move 32 bits
3123       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
3124         // Not enough header space in 32-bit VM: 12+3 = 15.
3125         movl(result, Address(str2, -1));
3126         shrl(result, 8);
3127         movdl(vec, result); // move 32 bits
3128       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
3129         load_unsigned_short(result, Address(str2, 0));
3130         movdl(vec, result); // move 32 bits
3131       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
3132         movdl(vec, Address(str2, 0)); // move 32 bits
3133       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
3134         movq(vec, Address(str2, 0));  // move 64 bits
3135       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
3136         // Array header size is 12 bytes in 32-bit VM
3137         // + 6 bytes for 3 chars == 18 bytes,
3138         // enough space to load vec and shift.
3139         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3140         if (ae == StrIntrinsicNode::UL) {
3141           int tail_off = int_cnt2-8;
3142           pmovzxbw(vec, Address(str2, tail_off));
3143           psrldq(vec, -2*tail_off);
3144         }
3145         else {
3146           int tail_off = int_cnt2*(1<<scale2);
3147           movdqu(vec, Address(str2, tail_off-16));
3148           psrldq(vec, 16-tail_off);
3149         }
3150       }
3151     } else { // not constant substring
3152       cmpl(cnt2, stride);
3153       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3154 
      // We can read beyond the string if str+16 does not cross a page boundary
      // since heaps are aligned and mapped by pages.
3157       assert(os::vm_page_size() < (int)G, "default page should be small");
3158       movl(result, str2); // We need only low 32 bits
3159       andl(result, ((int)os::vm_page_size()-1));
3160       cmpl(result, ((int)os::vm_page_size()-16));
3161       jccb(Assembler::belowEqual, CHECK_STR);
3162 
      // Move small strings to the stack to allow loading 16 bytes into vec.
3164       subptr(rsp, 16);
3165       int stk_offset = wordSize-(1<<scale2);
3166       push(cnt2);
3167 
3168       bind(COPY_SUBSTR);
3169       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3170         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3171         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3172       } else if (ae == StrIntrinsicNode::UU) {
3173         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3174         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3175       }
3176       decrement(cnt2);
3177       jccb(Assembler::notZero, COPY_SUBSTR);
3178 
3179       pop(cnt2);
3180       movptr(str2, rsp);  // New substring address
3181     } // non constant
3182 
3183     bind(CHECK_STR);
3184     cmpl(cnt1, stride);
3185     jccb(Assembler::aboveEqual, BIG_STRINGS);
3186 
3187     // Check cross page boundary.
3188     movl(result, str1); // We need only low 32 bits
3189     andl(result, ((int)os::vm_page_size()-1));
3190     cmpl(result, ((int)os::vm_page_size()-16));
3191     jccb(Assembler::belowEqual, BIG_STRINGS);
3192 
3193     subptr(rsp, 16);
3194     int stk_offset = -(1<<scale1);
3195     if (int_cnt2 < 0) { // not constant
3196       push(cnt2);
3197       stk_offset += wordSize;
3198     }
3199     movl(cnt2, cnt1);
3200 
3201     bind(COPY_STR);
3202     if (ae == StrIntrinsicNode::LL) {
3203       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3204       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3205     } else {
3206       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3207       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3208     }
3209     decrement(cnt2);
3210     jccb(Assembler::notZero, COPY_STR);
3211 
3212     if (int_cnt2 < 0) { // not constant
3213       pop(cnt2);
3214     }
3215     movptr(str1, rsp);  // New string address
3216 
3217     bind(BIG_STRINGS);
3218     // Load substring.
3219     if (int_cnt2 < 0) { // -1
3220       if (ae == StrIntrinsicNode::UL) {
3221         pmovzxbw(vec, Address(str2, 0));
3222       } else {
3223         movdqu(vec, Address(str2, 0));
3224       }
3225       push(cnt2);       // substr count
3226       push(str2);       // substr addr
3227       push(str1);       // string addr
3228     } else {
3229       // Small (< 8 chars) constant substrings are loaded already.
3230       movl(cnt2, int_cnt2);
3231     }
3232     push(tmp);  // original SP
3233 
3234   } // Finished loading
3235 
3236   //========================================================
3237   // Start search
3238   //
3239 
3240   movptr(result, str1); // string addr
3241 
3242   if (int_cnt2  < 0) {  // Only for non constant substring
3243     jmpb(SCAN_TO_SUBSTR);
3244 
3245     // SP saved at sp+0
3246     // String saved at sp+1*wordSize
3247     // Substr saved at sp+2*wordSize
3248     // Substr count saved at sp+3*wordSize
3249 
    // Reload substr for rescan; this code
    // is executed only for large substrings (> 8 chars).
3252     bind(RELOAD_SUBSTR);
3253     movptr(str2, Address(rsp, 2*wordSize));
3254     movl(cnt2, Address(rsp, 3*wordSize));
3255     if (ae == StrIntrinsicNode::UL) {
3256       pmovzxbw(vec, Address(str2, 0));
3257     } else {
3258       movdqu(vec, Address(str2, 0));
3259     }
    // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
    // again. Start from the next element after the previous match.
3263     subptr(str1, result); // Restore counter
3264     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3265       shrl(str1, 1);
3266     }
3267     addl(cnt1, str1);
3268     decrementl(cnt1);   // Shift to next element
3269     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3271 
3272     addptr(result, (1<<scale1));
3273   } // non constant
3274 
3275   // Scan string for start of substr in 16-byte vectors
3276   bind(SCAN_TO_SUBSTR);
3277   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3278   pcmpestri(vec, Address(result, 0), mode);
3279   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3280   subl(cnt1, stride);
3281   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3282   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3284   addptr(result, 16);
3285 
3286   bind(ADJUST_STR);
3287   cmpl(cnt1, stride); // Do not read beyond string
3288   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3289   // Back-up string to avoid reading beyond string.
3290   lea(result, Address(result, cnt1, scale1, -16));
3291   movl(cnt1, stride);
3292   jmpb(SCAN_TO_SUBSTR);
3293 
3294   // Found a potential substr
3295   bind(FOUND_CANDIDATE);
3296   // After pcmpestri tmp(rcx) contains matched element index
3297 
3298   // Make sure string is still long enough
3299   subl(cnt1, tmp);
3300   cmpl(cnt1, cnt2);
3301   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.
3303 
3304   bind(RET_NOT_FOUND);
3305   movl(result, -1);
3306   jmp(CLEANUP);
3307 
3308   bind(FOUND_SUBSTR);
3309   // Compute start addr of substr
3310   lea(result, Address(result, tmp, scale1));
3311   if (int_cnt2 > 0) { // Constant substring
3312     // Repeat search for small substring (< 8 chars)
3313     // from new point without reloading substring.
    // Have to check that we don't read beyond the string.
3315     cmpl(tmp, stride-int_cnt2);
3316     jccb(Assembler::greater, ADJUST_STR);
3317     // Fall through if matched whole substring.
3318   } else { // non constant
3319     assert(int_cnt2 == -1, "should be != 0");
3320 
3321     addl(tmp, cnt2);
3322     // Found result if we matched whole substring.
3323     cmpl(tmp, stride);
3324     jcc(Assembler::lessEqual, RET_FOUND);
3325 
3326     // Repeat search for small substring (<= 8 chars)
3327     // from new point 'str1' without reloading substring.
3328     cmpl(cnt2, stride);
    // Have to check that we don't read beyond the string.
3330     jccb(Assembler::lessEqual, ADJUST_STR);
3331 
3332     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3333     // Compare the rest of substring (> 8 chars).
3334     movptr(str1, result);
3335 
3336     cmpl(tmp, cnt2);
3337     // First 8 chars are already matched.
3338     jccb(Assembler::equal, CHECK_NEXT);
3339 
3340     bind(SCAN_SUBSTR);
3341     pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload string pointers if we did not match the whole vector
3343     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3344 
3345     bind(CHECK_NEXT);
3346     subl(cnt2, stride);
3347     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3348     addptr(str1, 16);
3349     if (ae == StrIntrinsicNode::UL) {
3350       addptr(str2, 8);
3351     } else {
3352       addptr(str2, 16);
3353     }
3354     subl(cnt1, stride);
3355     cmpl(cnt2, stride); // Do not read beyond substring
3356     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3357     // Back-up strings to avoid reading beyond substring.
3358 
3359     if (ae == StrIntrinsicNode::UL) {
3360       lea(str2, Address(str2, cnt2, scale2, -8));
3361       lea(str1, Address(str1, cnt2, scale1, -16));
3362     } else {
3363       lea(str2, Address(str2, cnt2, scale2, -16));
3364       lea(str1, Address(str1, cnt2, scale1, -16));
3365     }
3366     subl(cnt1, cnt2);
3367     movl(cnt2, stride);
3368     addl(cnt1, stride);
3369     bind(CONT_SCAN_SUBSTR);
3370     if (ae == StrIntrinsicNode::UL) {
3371       pmovzxbw(vec, Address(str2, 0));
3372     } else {
3373       movdqu(vec, Address(str2, 0));
3374     }
3375     jmp(SCAN_SUBSTR);
3376 
3377     bind(RET_FOUND_LONG);
3378     movptr(str1, Address(rsp, wordSize));
3379   } // non constant
3380 
3381   bind(RET_FOUND);
3382   // Compute substr offset
3383   subptr(result, str1);
3384   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3385     shrl(result, 1); // index
3386   }
3387   bind(CLEANUP);
3388   pop(rsp); // restore SP
3389 
3390 } // string_indexof
3391 
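// Find the first occurrence of the UTF-16 char 'ch' in the cnt1 chars at str1;
// on return 'result' is its char index, or -1 if the char does not occur.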
3392 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3393                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3394   ShortBranchVerifier sbv(this);
3395   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3396 
3397   int stride = 8;
3398 
3399   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3400         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3401         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3402         FOUND_SEQ_CHAR, DONE_LABEL;
3403 
3404   movptr(result, str1);
3405   if (UseAVX >= 2) {
3406     cmpl(cnt1, stride);
3407     jcc(Assembler::less, SCAN_TO_CHAR);
3408     cmpl(cnt1, 2*stride);
3409     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3410     movdl(vec1, ch);
3411     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3412     vpxor(vec2, vec2);
3413     movl(tmp, cnt1);
3414     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3415     andl(cnt1,0x0000000F);  //tail count (in chars)
3416 
3417     bind(SCAN_TO_16_CHAR_LOOP);
3418     vmovdqu(vec3, Address(result, 0));
3419     vpcmpeqw(vec3, vec3, vec1, 1);
3420     vptest(vec2, vec3);
3421     jcc(Assembler::carryClear, FOUND_CHAR);
3422     addptr(result, 32);
3423     subl(tmp, 2*stride);
3424     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3425     jmp(SCAN_TO_8_CHAR);
3426     bind(SCAN_TO_8_CHAR_INIT);
3427     movdl(vec1, ch);
3428     pshuflw(vec1, vec1, 0x00);
3429     pshufd(vec1, vec1, 0);
3430     pxor(vec2, vec2);
3431   }
3432   bind(SCAN_TO_8_CHAR);
3433   cmpl(cnt1, stride);
3434   jcc(Assembler::less, SCAN_TO_CHAR);
3435   if (UseAVX < 2) {
3436     movdl(vec1, ch);
3437     pshuflw(vec1, vec1, 0x00);
3438     pshufd(vec1, vec1, 0);
3439     pxor(vec2, vec2);
3440   }
3441   movl(tmp, cnt1);
3442   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3443   andl(cnt1,0x00000007);  //tail count (in chars)
3444 
3445   bind(SCAN_TO_8_CHAR_LOOP);
3446   movdqu(vec3, Address(result, 0));
3447   pcmpeqw(vec3, vec1);
3448   ptest(vec2, vec3);
3449   jcc(Assembler::carryClear, FOUND_CHAR);
3450   addptr(result, 16);
3451   subl(tmp, stride);
3452   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3453   bind(SCAN_TO_CHAR);
3454   testl(cnt1, cnt1);
3455   jcc(Assembler::zero, RET_NOT_FOUND);
3456   bind(SCAN_TO_CHAR_LOOP);
3457   load_unsigned_short(tmp, Address(result, 0));
3458   cmpl(ch, tmp);
3459   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3460   addptr(result, 2);
3461   subl(cnt1, 1);
3462   jccb(Assembler::zero, RET_NOT_FOUND);
3463   jmp(SCAN_TO_CHAR_LOOP);
3464 
3465   bind(RET_NOT_FOUND);
3466   movl(result, -1);
3467   jmpb(DONE_LABEL);
3468 
3469   bind(FOUND_CHAR);
3470   if (UseAVX >= 2) {
3471     vpmovmskb(tmp, vec3);
3472   } else {
3473     pmovmskb(tmp, vec3);
3474   }
3475   bsfl(ch, tmp);
3476   addptr(result, ch);
3477 
3478   bind(FOUND_SEQ_CHAR);
3479   subptr(result, str1);
3480   shrl(result, 1);
3481 
3482   bind(DONE_LABEL);
3483 } // string_indexof_char
3484 
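// Latin-1 variant of the above: 'ch' is compared against bytes, and the
// returned 'result' is a byte index into str1, or -1 if the byte does not occur.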
3485 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3486                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3487   ShortBranchVerifier sbv(this);
3488   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3489 
3490   int stride = 16;
3491 
3492   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3493         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3494         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3495         FOUND_SEQ_CHAR, DONE_LABEL;
3496 
3497   movptr(result, str1);
3498   if (UseAVX >= 2) {
3499     cmpl(cnt1, stride);
3500     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3501     cmpl(cnt1, stride*2);
3502     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3503     movdl(vec1, ch);
3504     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3505     vpxor(vec2, vec2);
3506     movl(tmp, cnt1);
3507     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3508     andl(cnt1,0x0000001F);  //tail count (in chars)
3509 
3510     bind(SCAN_TO_32_CHAR_LOOP);
3511     vmovdqu(vec3, Address(result, 0));
3512     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3513     vptest(vec2, vec3);
3514     jcc(Assembler::carryClear, FOUND_CHAR);
3515     addptr(result, 32);
3516     subl(tmp, stride*2);
3517     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3518     jmp(SCAN_TO_16_CHAR);
3519 
3520     bind(SCAN_TO_16_CHAR_INIT);
3521     movdl(vec1, ch);
3522     pxor(vec2, vec2);
3523     pshufb(vec1, vec2);
3524   }
3525 
3526   bind(SCAN_TO_16_CHAR);
3527   cmpl(cnt1, stride);
3528   jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
3529   if (UseAVX < 2) {
3530     movdl(vec1, ch);
3531     pxor(vec2, vec2);
3532     pshufb(vec1, vec2);
3533   }
3534   movl(tmp, cnt1);
3535   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3536   andl(cnt1,0x0000000F);  //tail count (in bytes)
3537 
3538   bind(SCAN_TO_16_CHAR_LOOP);
3539   movdqu(vec3, Address(result, 0));
3540   pcmpeqb(vec3, vec1);
3541   ptest(vec2, vec3);
3542   jcc(Assembler::carryClear, FOUND_CHAR);
3543   addptr(result, 16);
3544   subl(tmp, stride);
3545   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3546 
3547   bind(SCAN_TO_CHAR_INIT);
3548   testl(cnt1, cnt1);
3549   jcc(Assembler::zero, RET_NOT_FOUND);
3550   bind(SCAN_TO_CHAR_LOOP);
3551   load_unsigned_byte(tmp, Address(result, 0));
3552   cmpl(ch, tmp);
3553   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3554   addptr(result, 1);
3555   subl(cnt1, 1);
3556   jccb(Assembler::zero, RET_NOT_FOUND);
3557   jmp(SCAN_TO_CHAR_LOOP);
3558 
3559   bind(RET_NOT_FOUND);
3560   movl(result, -1);
3561   jmpb(DONE_LABEL);
3562 
3563   bind(FOUND_CHAR);
3564   if (UseAVX >= 2) {
3565     vpmovmskb(tmp, vec3);
3566   } else {
3567     pmovmskb(tmp, vec3);
3568   }
3569   bsfl(ch, tmp);
3570   addptr(result, ch);
3571 
3572   bind(FOUND_SEQ_CHAR);
3573   subptr(result, str1);
3574 
3575   bind(DONE_LABEL);
3576 } // stringL_indexof_char
3577 
3578 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3579   switch (eltype) {
3580   case T_BOOLEAN: return sizeof(jboolean);
3581   case T_BYTE:  return sizeof(jbyte);
3582   case T_SHORT: return sizeof(jshort);
3583   case T_CHAR:  return sizeof(jchar);
3584   case T_INT:   return sizeof(jint);
3585   default:
3586     ShouldNotReachHere();
3587     return -1;
3588   }
3589 }
3590 
3591 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3592   switch (eltype) {
3593   // T_BOOLEAN used as surrogate for unsigned byte
3594   case T_BOOLEAN: movzbl(dst, src);   break;
3595   case T_BYTE:    movsbl(dst, src);   break;
3596   case T_SHORT:   movswl(dst, src);   break;
3597   case T_CHAR:    movzwl(dst, src);   break;
3598   case T_INT:     movl(dst, src);     break;
3599   default:
3600     ShouldNotReachHere();
3601   }
3602 }
3603 
3604 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3605   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3606 }
3607 
3608 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3609   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3610 }
3611 
3612 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3613   const int vlen = Assembler::AVX_256bit;
3614   switch (eltype) {
3615   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3616   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3617   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3618   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3619   case T_INT:
3620     // do nothing
3621     break;
3622   default:
3623     ShouldNotReachHere();
3624   }
3625 }
3626 
3627 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3628                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3629                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3630                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3631                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3632                                         BasicType eltype) {
3633   ShortBranchVerifier sbv(this);
3634   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3635   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3636   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3637 
3638   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3639         SHORT_UNROLLED_LOOP_EXIT,
3640         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3641         UNROLLED_VECTOR_LOOP_BEGIN,
3642         END;
3643   switch (eltype) {
3644   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3645   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3646   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3647   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3648   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3649   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3650   }
3651 
3652   // For "renaming" for readibility of the code
3653   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3654                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3655                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3656 
3657   const int elsize = arrays_hashcode_elsize(eltype);
3658 
3659   /*
3660     if (cnt1 >= 2) {
3661       if (cnt1 >= 32) {
3662         UNROLLED VECTOR LOOP
3663       }
3664       UNROLLED SCALAR LOOP
3665     }
3666     SINGLE SCALAR
3667    */
3668 
3669   cmpl(cnt1, 32);
3670   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3671 
3672   // cnt1 >= 32 && generate_vectorized_loop
3673   xorl(index, index);
3674 
3675   // vresult = IntVector.zero(I256);
3676   for (int idx = 0; idx < 4; idx++) {
3677     vpxor(vresult[idx], vresult[idx]);
3678   }
3679   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3680   Register bound = tmp2;
3681   Register next = tmp3;
3682   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3683   movl(next, Address(tmp2, 0));
3684   movdl(vnext, next);
3685   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3686 
3687   // index = 0;
3688   // bound = cnt1 & ~(32 - 1);
3689   movl(bound, cnt1);
3690   andl(bound, ~(32 - 1));
3691   // for (; index < bound; index += 32) {
3692   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3693   // result *= next;
3694   imull(result, next);
  // Loop fission pays the cost of fetching from memory up front; OOO execution
  // can then hopefully do a better job of prefetching.
3697   for (int idx = 0; idx < 4; idx++) {
3698     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3699   }
3700   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3701   for (int idx = 0; idx < 4; idx++) {
3702     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3703     arrays_hashcode_elvcast(vtmp[idx], eltype);
3704     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3705   }
3706   // index += 32;
3707   addl(index, 32);
3708   // index < bound;
3709   cmpl(index, bound);
3710   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3711   // }
3712 
3713   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3714   subl(cnt1, bound);
3715   // release bound
3716 
3717   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3718   for (int idx = 0; idx < 4; idx++) {
3719     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3720     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3721     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3722   }
3723   // result += vresult.reduceLanes(ADD);
3724   for (int idx = 0; idx < 4; idx++) {
3725     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3726   }
3727 
3728   // } else if (cnt1 < 32) {
3729 
3730   bind(SHORT_UNROLLED_BEGIN);
3731   // int i = 1;
3732   movl(index, 1);
3733   cmpl(index, cnt1);
3734   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3735 
3736   // for (; i < cnt1 ; i += 2) {
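  // Each iteration consumes two elements:
  //   result = 31*31*result + 31*ary1[index-1] + ary1[index]
  // where 31*x is computed as (x << 5) - x and 961 == 31*31.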
3737   bind(SHORT_UNROLLED_LOOP_BEGIN);
3738   movl(tmp3, 961);
3739   imull(result, tmp3);
3740   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3741   movl(tmp3, tmp2);
3742   shll(tmp3, 5);
3743   subl(tmp3, tmp2);
3744   addl(result, tmp3);
3745   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3746   addl(result, tmp3);
3747   addl(index, 2);
3748   cmpl(index, cnt1);
3749   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3750 
3751   // }
3752   // if (i >= cnt1) {
3753   bind(SHORT_UNROLLED_LOOP_EXIT);
3754   jccb(Assembler::greater, END);
3755   movl(tmp2, result);
3756   shll(result, 5);
3757   subl(result, tmp2);
3758   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3759   addl(result, tmp3);
3760   // }
3761   bind(END);
3762 
3763   BLOCK_COMMENT("} // arrays_hashcode");
3764 
3765 } // arrays_hashcode
3766 
3767 // helper function for string_compare
3768 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3769                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3770                                            Address::ScaleFactor scale2, Register index, int ae) {
3771   if (ae == StrIntrinsicNode::LL) {
3772     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3773     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3774   } else if (ae == StrIntrinsicNode::UU) {
3775     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3776     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3777   } else {
3778     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3779     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3780   }
3781 }
3782 
3783 // Compare strings, used for char[] and byte[].
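// On return, 'result' follows the String.compareTo convention: the difference
// of the first mismatching elements, or the difference of the string lengths
// when one string is a prefix of the other (0 if the strings are equal).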
3784 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3785                                        Register cnt1, Register cnt2, Register result,
3786                                        XMMRegister vec1, int ae, KRegister mask) {
3787   ShortBranchVerifier sbv(this);
3788   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3789   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only AVX3
3790   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3791   int stride2x2 = 0x40;
3792   Address::ScaleFactor scale = Address::no_scale;
3793   Address::ScaleFactor scale1 = Address::no_scale;
3794   Address::ScaleFactor scale2 = Address::no_scale;
3795 
3796   if (ae != StrIntrinsicNode::LL) {
3797     stride2x2 = 0x20;
3798   }
3799 
3800   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3801     shrl(cnt2, 1);
3802   }
  // Compute the minimum of the string lengths, and push the
  // difference of the string lengths onto the stack.
  // The minimum is computed with a conditional move.
3806   movl(result, cnt1);
3807   subl(cnt1, cnt2);
3808   push(cnt1);
3809   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3810 
3811   // Is the minimum length zero?
3812   testl(cnt2, cnt2);
3813   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3814   if (ae == StrIntrinsicNode::LL) {
3815     // Load first bytes
3816     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3817     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3818   } else if (ae == StrIntrinsicNode::UU) {
3819     // Load first characters
3820     load_unsigned_short(result, Address(str1, 0));
3821     load_unsigned_short(cnt1, Address(str2, 0));
3822   } else {
3823     load_unsigned_byte(result, Address(str1, 0));
3824     load_unsigned_short(cnt1, Address(str2, 0));
3825   }
3826   subl(result, cnt1);
3827   jcc(Assembler::notZero,  POP_LABEL);
3828 
3829   if (ae == StrIntrinsicNode::UU) {
3830     // Divide length by 2 to get number of chars
3831     shrl(cnt2, 1);
3832   }
3833   cmpl(cnt2, 1);
3834   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3835 
3836   // Check if the strings start at the same location and setup scale and stride
3837   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3838     cmpptr(str1, str2);
3839     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3840     if (ae == StrIntrinsicNode::LL) {
3841       scale = Address::times_1;
3842       stride = 16;
3843     } else {
3844       scale = Address::times_2;
3845       stride = 8;
3846     }
3847   } else {
3848     scale1 = Address::times_1;
3849     scale2 = Address::times_2;
3850     // scale not used
3851     stride = 8;
3852   }
3853 
3854   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3855     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3856     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3857     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3858     Label COMPARE_TAIL_LONG;
3859     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only AVX3
3860 
3861     int pcmpmask = 0x19;
3862     if (ae == StrIntrinsicNode::LL) {
3863       pcmpmask &= ~0x01;
3864     }
3865 
    // Set up to compare 16-char (32-byte) vectors,
    // starting from the first character again because it has an aligned address.
3868     if (ae == StrIntrinsicNode::LL) {
3869       stride2 = 32;
3870     } else {
3871       stride2 = 16;
3872     }
3873     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3874       adr_stride = stride << scale;
3875     } else {
3876       adr_stride1 = 8;  //stride << scale1;
3877       adr_stride2 = 16; //stride << scale2;
3878     }
3879 
3880     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3881     // rax and rdx are used by pcmpestri as elements counters
3882     movl(result, cnt2);
3883     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3884     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3885 
3886     // fast path : compare first 2 8-char vectors.
3887     bind(COMPARE_16_CHARS);
3888     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3889       movdqu(vec1, Address(str1, 0));
3890     } else {
3891       pmovzxbw(vec1, Address(str1, 0));
3892     }
3893     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3894     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3895 
3896     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3897       movdqu(vec1, Address(str1, adr_stride));
3898       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3899     } else {
3900       pmovzxbw(vec1, Address(str1, adr_stride1));
3901       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3902     }
3903     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3904     addl(cnt1, stride);
3905 
3906     // Compare the characters at index in cnt1
3907     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3908     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3909     subl(result, cnt2);
3910     jmp(POP_LABEL);
3911 
3912     // Setup the registers to start vector comparison loop
3913     bind(COMPARE_WIDE_VECTORS);
3914     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3915       lea(str1, Address(str1, result, scale));
3916       lea(str2, Address(str2, result, scale));
3917     } else {
3918       lea(str1, Address(str1, result, scale1));
3919       lea(str2, Address(str2, result, scale2));
3920     }
3921     subl(result, stride2);
3922     subl(cnt2, stride2);
3923     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3924     negptr(result);
3925 
    //  In a loop, compare 16 chars (32 bytes) at a time using vpxor + vptest.
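    //  (vec1 ^ mem is all-zero iff the 32 bytes are equal; vptest(vec1, vec1)
    //   sets ZF exactly in that case)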
3927     bind(COMPARE_WIDE_VECTORS_LOOP);
3928 
3929     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3930       cmpl(cnt2, stride2x2);
3931       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3932       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3933       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3934 
3935       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3936       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3937         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3938         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3939       } else {
3940         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3941         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3942       }
3943       kortestql(mask, mask);
3944       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3945       addptr(result, stride2x2);  // update since we already compared at this addr
3946       subl(cnt2, stride2x2);      // and sub the size too
3947       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3948 
3949       vpxor(vec1, vec1);
3950       jmpb(COMPARE_WIDE_TAIL);
3951     }//if (VM_Version::supports_avx512vlbw())
3952 
3953     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3954     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3955       vmovdqu(vec1, Address(str1, result, scale));
3956       vpxor(vec1, Address(str2, result, scale));
3957     } else {
3958       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3959       vpxor(vec1, Address(str2, result, scale2));
3960     }
3961     vptest(vec1, vec1);
3962     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3963     addptr(result, stride2);
3964     subl(cnt2, stride2);
3965     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3966     // clean upper bits of YMM registers
3967     vpxor(vec1, vec1);
3968 
3969     // compare wide vectors tail
3970     bind(COMPARE_WIDE_TAIL);
3971     testptr(result, result);
3972     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3973 
3974     movl(result, stride2);
3975     movl(cnt2, result);
3976     negptr(result);
3977     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3978 
    // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3980     bind(VECTOR_NOT_EQUAL);
3981     // clean upper bits of YMM registers
3982     vpxor(vec1, vec1);
3983     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3984       lea(str1, Address(str1, result, scale));
3985       lea(str2, Address(str2, result, scale));
3986     } else {
3987       lea(str1, Address(str1, result, scale1));
3988       lea(str2, Address(str2, result, scale2));
3989     }
3990     jmp(COMPARE_16_CHARS);
3991 
    // Compare tail chars; length is between 1 and 15 chars
3993     bind(COMPARE_TAIL_LONG);
3994     movl(cnt2, result);
3995     cmpl(cnt2, stride);
3996     jcc(Assembler::less, COMPARE_SMALL_STR);
3997 
3998     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3999       movdqu(vec1, Address(str1, 0));
4000     } else {
4001       pmovzxbw(vec1, Address(str1, 0));
4002     }
4003     pcmpestri(vec1, Address(str2, 0), pcmpmask);
4004     jcc(Assembler::below, COMPARE_INDEX_CHAR);
4005     subptr(cnt2, stride);
4006     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
4007     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4008       lea(str1, Address(str1, result, scale));
4009       lea(str2, Address(str2, result, scale));
4010     } else {
4011       lea(str1, Address(str1, result, scale1));
4012       lea(str2, Address(str2, result, scale2));
4013     }
4014     negptr(cnt2);
4015     jmpb(WHILE_HEAD_LABEL);
4016 
4017     bind(COMPARE_SMALL_STR);
4018   } else if (UseSSE42Intrinsics) {
4019     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
4020     int pcmpmask = 0x19;
    // Set up to compare 8-char (16-byte) vectors,
    // starting from the first character again because it has an aligned address.
4023     movl(result, cnt2);
4024     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
4025     if (ae == StrIntrinsicNode::LL) {
4026       pcmpmask &= ~0x01;
4027     }
4028     jcc(Assembler::zero, COMPARE_TAIL);
4029     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4030       lea(str1, Address(str1, result, scale));
4031       lea(str2, Address(str2, result, scale));
4032     } else {
4033       lea(str1, Address(str1, result, scale1));
4034       lea(str2, Address(str2, result, scale2));
4035     }
4036     negptr(result);
4037 
4038     // pcmpestri
4039     //   inputs:
4040     //     vec1- substring
4041     //     rax - negative string length (elements count)
4042     //     mem - scanned string
4043     //     rdx - string length (elements count)
4044     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
4045     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
4046     //   outputs:
4047     //     rcx - first mismatched element index
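    //   Here pcmpmask 0x19 decodes as: negated polarity (bits [5:4] = 01),
    //   "equal each" comparison (bits [3:2] = 10), unsigned words
    //   (bits [1:0] = 01); clearing bit 0 for LL selects unsigned bytes.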
4048     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
4049 
4050     bind(COMPARE_WIDE_VECTORS);
4051     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4052       movdqu(vec1, Address(str1, result, scale));
4053       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
4054     } else {
4055       pmovzxbw(vec1, Address(str1, result, scale1));
4056       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
4057     }
4058     // After pcmpestri cnt1(rcx) contains mismatched element index
4059 
4060     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
4061     addptr(result, stride);
4062     subptr(cnt2, stride);
4063     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4064 
4065     // compare wide vectors tail
4066     testptr(result, result);
4067     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
4068 
4069     movl(cnt2, stride);
4070     movl(result, stride);
4071     negptr(result);
4072     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4073       movdqu(vec1, Address(str1, result, scale));
4074       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
4075     } else {
4076       pmovzxbw(vec1, Address(str1, result, scale1));
4077       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
4078     }
4079     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
4080 
4081     // Mismatched characters in the vectors
4082     bind(VECTOR_NOT_EQUAL);
4083     addptr(cnt1, result);
4084     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
4085     subl(result, cnt2);
4086     jmpb(POP_LABEL);
4087 
4088     bind(COMPARE_TAIL); // limit is zero
4089     movl(cnt2, result);
4090     // Fallthru to tail compare
4091   }
4092   // Shift str2 and str1 to the end of the arrays, negate min
4093   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4094     lea(str1, Address(str1, cnt2, scale));
4095     lea(str2, Address(str2, cnt2, scale));
4096   } else {
4097     lea(str1, Address(str1, cnt2, scale1));
4098     lea(str2, Address(str2, cnt2, scale2));
4099   }
4100   decrementl(cnt2);  // first character was compared already
4101   negptr(cnt2);
4102 
4103   // Compare the rest of the elements
4104   bind(WHILE_HEAD_LABEL);
4105   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
4106   subl(result, cnt1);
4107   jccb(Assembler::notZero, POP_LABEL);
4108   increment(cnt2);
4109   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
4110 
4111   // Strings are equal up to min length.  Return the length difference.
4112   bind(LENGTH_DIFF_LABEL);
4113   pop(result);
4114   if (ae == StrIntrinsicNode::UU) {
4115     // Divide diff by 2 to get number of chars
4116     sarl(result, 1);
4117   }
4118   jmpb(DONE_LABEL);
4119 
4120   if (VM_Version::supports_avx512vlbw()) {
4121 
4122     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
4123 
4124     kmovql(cnt1, mask);
4125     notq(cnt1);
4126     bsfq(cnt2, cnt1);
4127     if (ae != StrIntrinsicNode::LL) {
4128       // Divide diff by 2 to get number of chars
4129       sarl(cnt2, 1);
4130     }
4131     addq(result, cnt2);
4132     if (ae == StrIntrinsicNode::LL) {
4133       load_unsigned_byte(cnt1, Address(str2, result));
4134       load_unsigned_byte(result, Address(str1, result));
4135     } else if (ae == StrIntrinsicNode::UU) {
4136       load_unsigned_short(cnt1, Address(str2, result, scale));
4137       load_unsigned_short(result, Address(str1, result, scale));
4138     } else {
4139       load_unsigned_short(cnt1, Address(str2, result, scale2));
4140       load_unsigned_byte(result, Address(str1, result, scale1));
4141     }
4142     subl(result, cnt1);
4143     jmpb(POP_LABEL);
4144   }//if (VM_Version::supports_avx512vlbw())
4145 
4146   // Discard the stored length difference
4147   bind(POP_LABEL);
4148   pop(cnt1);
4149 
4150   // That's it
4151   bind(DONE_LABEL);
4152   if(ae == StrIntrinsicNode::UL) {
4153     negl(result);
4154   }
4155 
4156 }
4157 
// Search for a non-ASCII character (negative byte value) in a byte array;
// return the index of the first such character, otherwise the length
// of the array segment searched.
4161 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4162 //   @IntrinsicCandidate
4163 //   public static int countPositives(byte[] ba, int off, int len) {
4164 //     for (int i = off; i < off + len; i++) {
4165 //       if (ba[i] < 0) {
4166 //         return i - off;
4167 //       }
4168 //     }
4169 //     return len;
4170 //   }
4171 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4172   Register result, Register tmp1,
4173   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4174   // rsi: byte array
4175   // rcx: len
4176   // rax: result
4177   ShortBranchVerifier sbv(this);
4178   assert_different_registers(ary1, len, result, tmp1);
4179   assert_different_registers(vec1, vec2);
4180   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4181 
4182   movl(result, len); // copy
4183   // len == 0
4184   testl(len, len);
4185   jcc(Assembler::zero, DONE);
4186 
4187   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4188     VM_Version::supports_avx512vlbw() &&
4189     VM_Version::supports_bmi2()) {
4190 
4191     Label test_64_loop, test_tail, BREAK_LOOP;
4192     movl(tmp1, len);
4193     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4194 
4195     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4196     andl(len,  0xffffffc0); // vector count (in chars)
4197     jccb(Assembler::zero, test_tail);
4198 
4199     lea(ary1, Address(ary1, len, Address::times_1));
4200     negptr(len);
4201 
4202     bind(test_64_loop);
    // Check whether this 64-byte block contains any negative bytes
4204     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4205     kortestql(mask1, mask1);
4206     jcc(Assembler::notZero, BREAK_LOOP);
4207 
4208     addptr(len, 64);
4209     jccb(Assembler::notZero, test_64_loop);
4210 
4211     bind(test_tail);
4212     // bail out when there is nothing to be done
4213     testl(tmp1, -1);
4214     jcc(Assembler::zero, DONE);
4215 
4216 
    // check the tail for absence of negatives
4218     // ~(~0 << len) applied up to two times (for 32-bit scenario)
4219     {
4220       Register tmp3_aliased = len;
4221       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4222       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4223       notq(tmp3_aliased);
4224       kmovql(mask2, tmp3_aliased);
4225     }
4226 
4227     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4228     ktestq(mask1, mask2);
4229     jcc(Assembler::zero, DONE);
4230 
    // do a full check for negative bytes in the tail
    movl(len, tmp1); // tmp1 holds the low 6 bits of the original len;
                     // ary1 is already pointing to the right place
4234     jmpb(TAIL_START);
4235 
4236     bind(BREAK_LOOP);
4237     // At least one byte in the last 64 byte block was negative.
4238     // Set up to look at the last 64 bytes as if they were a tail
4239     lea(ary1, Address(ary1, len, Address::times_1));
4240     addptr(result, len);
4241     // Ignore the very last byte: if all others are positive,
4242     // it must be negative, so we can skip right to the 2+1 byte
4243     // end comparison at this point
4244     orl(result, 63);
4245     movl(len, 63);
4246     // Fallthru to tail compare
4247   } else {
4248 
4249     if (UseAVX >= 2) {
4250       // With AVX2, use 32-byte vector compare
4251       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4252 
4253       // Compare 32-byte vectors
4254       testl(len, 0xffffffe0);   // vector count (in bytes)
4255       jccb(Assembler::zero, TAIL_START);
4256 
4257       andl(len, 0xffffffe0);
4258       lea(ary1, Address(ary1, len, Address::times_1));
4259       negptr(len);
4260 
4261       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
4262       movdl(vec2, tmp1);
4263       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4264 
4265       bind(COMPARE_WIDE_VECTORS);
4266       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4267       vptest(vec1, vec2);
4268       jccb(Assembler::notZero, BREAK_LOOP);
4269       addptr(len, 32);
4270       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4271 
4272       testl(result, 0x0000001f);   // any bytes remaining?
4273       jcc(Assembler::zero, DONE);
4274 
4275       // Quick test using the already prepared vector mask
4276       movl(len, result);
4277       andl(len, 0x0000001f);
4278       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4279       vptest(vec1, vec2);
4280       jcc(Assembler::zero, DONE);
4281       // There are zeros, jump to the tail to determine exactly where
4282       jmpb(TAIL_START);
4283 
4284       bind(BREAK_LOOP);
4285       // At least one byte in the last 32-byte vector is negative.
4286       // Set up to look at the last 32 bytes as if they were a tail
4287       lea(ary1, Address(ary1, len, Address::times_1));
4288       addptr(result, len);
4289       // Ignore the very last byte: if all others are positive,
4290       // it must be negative, so we can skip right to the 2+1 byte
4291       // end comparison at this point
4292       orl(result, 31);
4293       movl(len, 31);
4294       // Fallthru to tail compare
4295     } else if (UseSSE42Intrinsics) {
4296       // With SSE4.2, use double quad vector compare
4297       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4298 
4299       // Compare 16-byte vectors
4300       testl(len, 0xfffffff0);   // vector count (in bytes)
4301       jcc(Assembler::zero, TAIL_START);
4302 
4303       andl(len, 0xfffffff0);
4304       lea(ary1, Address(ary1, len, Address::times_1));
4305       negptr(len);
4306 
4307       movl(tmp1, 0x80808080);
4308       movdl(vec2, tmp1);
4309       pshufd(vec2, vec2, 0);
4310 
4311       bind(COMPARE_WIDE_VECTORS);
4312       movdqu(vec1, Address(ary1, len, Address::times_1));
4313       ptest(vec1, vec2);
4314       jccb(Assembler::notZero, BREAK_LOOP);
4315       addptr(len, 16);
4316       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4317 
4318       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4319       jcc(Assembler::zero, DONE);
4320 
4321       // Quick test using the already prepared vector mask
4322       movl(len, result);
4323       andl(len, 0x0000000f);   // tail count (in bytes)
4324       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4325       ptest(vec1, vec2);
4326       jcc(Assembler::zero, DONE);
4327       jmpb(TAIL_START);
4328 
4329       bind(BREAK_LOOP);
4330       // At least one byte in the last 16-byte vector is negative.
4331       // Set up and look at the last 16 bytes as if they were a tail
4332       lea(ary1, Address(ary1, len, Address::times_1));
4333       addptr(result, len);
4334       // Ignore the very last byte: if all others are positive,
4335       // it must be negative, so we can skip right to the 2+1 byte
4336       // end comparison at this point
4337       orl(result, 15);
4338       movl(len, 15);
4339       // Fallthru to tail compare
4340     }
4341   }
4342 
4343   bind(TAIL_START);
4344   // Compare 4-byte vectors
4345   andl(len, 0xfffffffc); // vector count (in bytes)
4346   jccb(Assembler::zero, COMPARE_CHAR);
4347 
4348   lea(ary1, Address(ary1, len, Address::times_1));
4349   negptr(len);
4350 
4351   bind(COMPARE_VECTORS);
4352   movl(tmp1, Address(ary1, len, Address::times_1));
4353   andl(tmp1, 0x80808080);
4354   jccb(Assembler::notZero, TAIL_ADJUST);
4355   addptr(len, 4);
4356   jccb(Assembler::notZero, COMPARE_VECTORS);
4357 
4358   // Compare trailing char (final 2-3 bytes), if any
4359   bind(COMPARE_CHAR);
4360 
4361   testl(result, 0x2);   // tail  char
4362   jccb(Assembler::zero, COMPARE_BYTE);
4363   load_unsigned_short(tmp1, Address(ary1, 0));
4364   andl(tmp1, 0x00008080);
4365   jccb(Assembler::notZero, CHAR_ADJUST);
4366   lea(ary1, Address(ary1, 2));
4367 
4368   bind(COMPARE_BYTE);
4369   testl(result, 0x1);   // tail  byte
4370   jccb(Assembler::zero, DONE);
4371   load_unsigned_byte(tmp1, Address(ary1, 0));
4372   testl(tmp1, 0x00000080);
4373   jccb(Assembler::zero, DONE);
4374   subptr(result, 1);
4375   jmpb(DONE);
4376 
4377   bind(TAIL_ADJUST);
4378   // there are negative bits in the last 4 byte block.
4379   // Adjust result and check the next three bytes
4380   addptr(result, len);
4381   orl(result, 3);
4382   lea(ary1, Address(ary1, len, Address::times_1));
4383   jmpb(COMPARE_CHAR);
4384 
4385   bind(CHAR_ADJUST);
4386   // We are looking at a char + optional byte tail, and found that one
4387   // of the bytes in the char is negative. Adjust the result, check the
4388   // first byte and readjust if needed.
4389   andl(result, 0xfffffffc);
4390   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4391   jccb(Assembler::notZero, DONE);
4392   addptr(result, 1);
4393 
4394   // That's it
4395   bind(DONE);
4396   if (UseAVX >= 2) {
4397     // clean upper bits of YMM registers
4398     vpxor(vec1, vec1);
4399     vpxor(vec2, vec2);
4400   }
4401 }
4402 
4403 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4404 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4405                                       Register limit, Register result, Register chr,
4406                                       XMMRegister vec1, XMMRegister vec2, bool is_char,
4407                                       KRegister mask, bool expand_ary2) {
4408   // for expand_ary2, limit is the (smaller) size of the second array.
4409   ShortBranchVerifier sbv(this);
4410   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4411 
4412   assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4413          "Expansion only implemented for AVX2");
4414 
4415   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4416   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4417 
4418   Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4419   int scaleIncr = expand_ary2 ? 8 : 16;
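  // With expand_ary2, ary1 holds 16-bit elements while ary2 holds bytes: each byte of
  // ary2 is zero-extended (vpmovzxbw / movzbl below) before comparing, which is why
  // ary1 is addressed with a times_2 scale and the loop strides are halved (scaleIncr).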
4420 
4421   if (is_array_equ) {
4422     // Check the input args
4423     cmpoop(ary1, ary2);
4424     jcc(Assembler::equal, TRUE_LABEL);
4425 
4426     // Need additional checks for arrays_equals.
4427     testptr(ary1, ary1);
4428     jcc(Assembler::zero, FALSE_LABEL);
4429     testptr(ary2, ary2);
4430     jcc(Assembler::zero, FALSE_LABEL);
4431 
4432     // Check the lengths
4433     movl(limit, Address(ary1, length_offset));
4434     cmpl(limit, Address(ary2, length_offset));
4435     jcc(Assembler::notEqual, FALSE_LABEL);
4436   }
4437 
4438   // count == 0
4439   testl(limit, limit);
4440   jcc(Assembler::zero, TRUE_LABEL);
4441 
4442   if (is_array_equ) {
4443     // Load array address
4444     lea(ary1, Address(ary1, base_offset));
4445     lea(ary2, Address(ary2, base_offset));
4446   }
4447 
4448   if (is_array_equ && is_char) {
4449     // arrays_equals when used for char[].
4450     shll(limit, 1);      // byte count != 0
4451   }
4452   movl(result, limit); // copy
4453 
4454   if (UseAVX >= 2) {
4455     // With AVX2, use 32-byte vector compare
4456     Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4457 
4458     // Compare 32-byte vectors
4459     if (expand_ary2) {
4460       andl(result, 0x0000000f);  //   tail count (in bytes)
4461       andl(limit, 0xfffffff0);   // vector count (in bytes)
4462       jcc(Assembler::zero, COMPARE_TAIL);
4463     } else {
4464       andl(result, 0x0000001f);  //   tail count (in bytes)
4465       andl(limit, 0xffffffe0);   // vector count (in bytes)
4466       jcc(Assembler::zero, COMPARE_TAIL_16);
4467     }
4468 
4469     lea(ary1, Address(ary1, limit, scaleFactor));
4470     lea(ary2, Address(ary2, limit, Address::times_1));
4471     negptr(limit);
4472 
4473     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4474       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4475 
4476       cmpl(limit, -64);
4477       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4478 
4479       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4480 
4481       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4482       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4483       kortestql(mask, mask);
4484       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4485       addptr(limit, 64);  // update since we already compared at this addr
4486       cmpl(limit, -64);
4487       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4488 
4489       // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the non-wide path:
4491       //  cmpl(limit, 0);
4492       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4493       // But since we stopped at the points ary{1,2}+limit which are
4494       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4495       // (|limit| <= 32 and result < 32),
4496       // we may just compare the last 64 bytes.
4497       //
      addptr(result, -64);   // it is safe because we just came from this area
4499       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4500       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4501       kortestql(mask, mask);
4502       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4503 
4504       jmp(TRUE_LABEL);
4505 
4506       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4507 
4508     }//if (VM_Version::supports_avx512vlbw())
4509 
4510     bind(COMPARE_WIDE_VECTORS);
4511     vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4512     if (expand_ary2) {
4513       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4514     } else {
4515       vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4516     }
4517     vpxor(vec1, vec2);
4518 
4519     vptest(vec1, vec1);
4520     jcc(Assembler::notZero, FALSE_LABEL);
4521     addptr(limit, scaleIncr * 2);
4522     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4523 
4524     testl(result, result);
4525     jcc(Assembler::zero, TRUE_LABEL);
4526 
4527     vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4528     if (expand_ary2) {
4529       vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4530     } else {
4531       vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4532     }
4533     vpxor(vec1, vec2);
4534 
4535     vptest(vec1, vec1);
4536     jcc(Assembler::notZero, FALSE_LABEL);
4537     jmp(TRUE_LABEL);
4538 
4539     bind(COMPARE_TAIL_16); // limit is zero
4540     movl(limit, result);
4541 
4542     // Compare 16-byte chunks
4543     andl(result, 0x0000000f);  //   tail count (in bytes)
4544     andl(limit, 0xfffffff0);   // vector count (in bytes)
4545     jcc(Assembler::zero, COMPARE_TAIL);
4546 
4547     lea(ary1, Address(ary1, limit, scaleFactor));
4548     lea(ary2, Address(ary2, limit, Address::times_1));
4549     negptr(limit);
4550 
4551     bind(COMPARE_WIDE_VECTORS_16);
4552     movdqu(vec1, Address(ary1, limit, scaleFactor));
4553     if (expand_ary2) {
4554       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4555     } else {
4556       movdqu(vec2, Address(ary2, limit, Address::times_1));
4557     }
4558     pxor(vec1, vec2);
4559 
4560     ptest(vec1, vec1);
4561     jcc(Assembler::notZero, FALSE_LABEL);
4562     addptr(limit, scaleIncr);
4563     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4564 
4565     bind(COMPARE_TAIL); // limit is zero
4566     movl(limit, result);
4567     // Fallthru to tail compare
4568   } else if (UseSSE42Intrinsics) {
4569     // With SSE4.2, use double quad vector compare
4570     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4571 
4572     // Compare 16-byte vectors
4573     andl(result, 0x0000000f);  //   tail count (in bytes)
4574     andl(limit, 0xfffffff0);   // vector count (in bytes)
4575     jcc(Assembler::zero, COMPARE_TAIL);
4576 
4577     lea(ary1, Address(ary1, limit, Address::times_1));
4578     lea(ary2, Address(ary2, limit, Address::times_1));
4579     negptr(limit);
4580 
4581     bind(COMPARE_WIDE_VECTORS);
4582     movdqu(vec1, Address(ary1, limit, Address::times_1));
4583     movdqu(vec2, Address(ary2, limit, Address::times_1));
4584     pxor(vec1, vec2);
4585 
4586     ptest(vec1, vec1);
4587     jcc(Assembler::notZero, FALSE_LABEL);
4588     addptr(limit, 16);
4589     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4590 
4591     testl(result, result);
4592     jcc(Assembler::zero, TRUE_LABEL);
4593 
4594     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4595     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4596     pxor(vec1, vec2);
4597 
4598     ptest(vec1, vec1);
4599     jccb(Assembler::notZero, FALSE_LABEL);
4600     jmpb(TRUE_LABEL);
4601 
4602     bind(COMPARE_TAIL); // limit is zero
4603     movl(limit, result);
4604     // Fallthru to tail compare
4605   }
4606 
4607   // Compare 4-byte vectors
4608   if (expand_ary2) {
4609     testl(result, result);
4610     jccb(Assembler::zero, TRUE_LABEL);
4611   } else {
4612     andl(limit, 0xfffffffc); // vector count (in bytes)
4613     jccb(Assembler::zero, COMPARE_CHAR);
4614   }
4615 
4616   lea(ary1, Address(ary1, limit, scaleFactor));
4617   lea(ary2, Address(ary2, limit, Address::times_1));
4618   negptr(limit);
4619 
4620   bind(COMPARE_VECTORS);
4621   if (expand_ary2) {
4622     // There are no "vector" operations for bytes to shorts
4623     movzbl(chr, Address(ary2, limit, Address::times_1));
4624     cmpw(Address(ary1, limit, Address::times_2), chr);
4625     jccb(Assembler::notEqual, FALSE_LABEL);
4626     addptr(limit, 1);
4627     jcc(Assembler::notZero, COMPARE_VECTORS);
4628     jmp(TRUE_LABEL);
4629   } else {
4630     movl(chr, Address(ary1, limit, Address::times_1));
4631     cmpl(chr, Address(ary2, limit, Address::times_1));
4632     jccb(Assembler::notEqual, FALSE_LABEL);
4633     addptr(limit, 4);
4634     jcc(Assembler::notZero, COMPARE_VECTORS);
4635   }
4636 
4637   // Compare trailing char (final 2 bytes), if any
4638   bind(COMPARE_CHAR);
4639   testl(result, 0x2);   // tail  char
4640   jccb(Assembler::zero, COMPARE_BYTE);
4641   load_unsigned_short(chr, Address(ary1, 0));
4642   load_unsigned_short(limit, Address(ary2, 0));
4643   cmpl(chr, limit);
4644   jccb(Assembler::notEqual, FALSE_LABEL);
4645 
4646   if (is_array_equ && is_char) {
4647     bind(COMPARE_BYTE);
4648   } else {
4649     lea(ary1, Address(ary1, 2));
4650     lea(ary2, Address(ary2, 2));
4651 
4652     bind(COMPARE_BYTE);
4653     testl(result, 0x1);   // tail  byte
4654     jccb(Assembler::zero, TRUE_LABEL);
4655     load_unsigned_byte(chr, Address(ary1, 0));
4656     load_unsigned_byte(limit, Address(ary2, 0));
4657     cmpl(chr, limit);
4658     jccb(Assembler::notEqual, FALSE_LABEL);
4659   }
4660   bind(TRUE_LABEL);
4661   movl(result, 1);   // return true
4662   jmpb(DONE);
4663 
4664   bind(FALSE_LABEL);
4665   xorl(result, result); // return false
4666 
4667   // That's it
4668   bind(DONE);
4669   if (UseAVX >= 2) {
4670     // clean upper bits of YMM registers
4671     vpxor(vec1, vec1);
4672     vpxor(vec2, vec2);
4673   }
4674 }
4675 
4676 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4677 #define __ masm.
4678   Register dst = stub.data<0>();
4679   XMMRegister src = stub.data<1>();
4680   address target = stub.data<2>();
4681   __ bind(stub.entry());
4682   __ subptr(rsp, 8);
4683   __ movdbl(Address(rsp), src);
4684   __ call(RuntimeAddress(target));
4685   // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
4686   __ pop(dst);
4687   __ jmp(stub.continuation());
4688 #undef __
4689 }
4690 
4691 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4692   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4693   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4694 
4695   address slowpath_target;
4696   if (dst_bt == T_INT) {
4697     if (src_bt == T_FLOAT) {
4698       cvttss2sil(dst, src);
4699       cmpl(dst, 0x80000000);
4700       slowpath_target = StubRoutines::x86::f2i_fixup();
4701     } else {
4702       cvttsd2sil(dst, src);
4703       cmpl(dst, 0x80000000);
4704       slowpath_target = StubRoutines::x86::d2i_fixup();
4705     }
4706   } else {
4707     if (src_bt == T_FLOAT) {
4708       cvttss2siq(dst, src);
4709       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4710       slowpath_target = StubRoutines::x86::f2l_fixup();
4711     } else {
4712       cvttsd2siq(dst, src);
4713       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4714       slowpath_target = StubRoutines::x86::d2l_fixup();
4715     }
4716   }
4717 
4718   // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
4719   int max_size = 23 + (UseAPX ? 1 : 0);
4720   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
4721   jcc(Assembler::equal, stub->entry());
4722   bind(stub->continuation());
4723 }
4724 
4725 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4726                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4727   switch(ideal_opc) {
4728     case Op_LShiftVS:
4729       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4730     case Op_LShiftVI:
4731       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4732     case Op_LShiftVL:
4733       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4734     case Op_RShiftVS:
4735       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4736     case Op_RShiftVI:
4737       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4738     case Op_RShiftVL:
4739       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4740     case Op_URShiftVS:
4741       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4742     case Op_URShiftVI:
4743       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4744     case Op_URShiftVL:
4745       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4746     case Op_RotateRightV:
4747       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4748     case Op_RotateLeftV:
4749       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4750     default:
4751       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4752       break;
4753   }
4754 }
4755 
4756 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4757                                                XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4758   if (is_unsigned) {
4759     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4760   } else {
4761     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4762   }
4763 }
4764 
4765 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4766                                                       XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4767   switch (elem_bt) {
4768     case T_BYTE:
4769       if (ideal_opc == Op_SaturatingAddV) {
4770         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4771       } else {
4772         assert(ideal_opc == Op_SaturatingSubV, "");
4773         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4774       }
4775       break;
4776     case T_SHORT:
4777       if (ideal_opc == Op_SaturatingAddV) {
4778         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4779       } else {
4780         assert(ideal_opc == Op_SaturatingSubV, "");
4781         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4782       }
4783       break;
4784     default:
4785       fatal("Unsupported type %s", type2name(elem_bt));
4786       break;
4787   }
4788 }
4789 
4790 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4791                                                         XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4792   switch (elem_bt) {
4793     case T_BYTE:
4794       if (ideal_opc == Op_SaturatingAddV) {
4795         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4796       } else {
4797         assert(ideal_opc == Op_SaturatingSubV, "");
4798         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4799       }
4800       break;
4801     case T_SHORT:
4802       if (ideal_opc == Op_SaturatingAddV) {
4803         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4804       } else {
4805         assert(ideal_opc == Op_SaturatingSubV, "");
4806         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4807       }
4808       break;
4809     default:
4810       fatal("Unsupported type %s", type2name(elem_bt));
4811       break;
4812   }
4813 }
4814 
4815 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4816                                                Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4817   if (is_unsigned) {
4818     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4819   } else {
4820     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4821   }
4822 }
4823 
4824 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4825                                                       XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4826   switch (elem_bt) {
4827     case T_BYTE:
4828       if (ideal_opc == Op_SaturatingAddV) {
4829         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4830       } else {
4831         assert(ideal_opc == Op_SaturatingSubV, "");
4832         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4833       }
4834       break;
4835     case T_SHORT:
4836       if (ideal_opc == Op_SaturatingAddV) {
4837         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4838       } else {
4839         assert(ideal_opc == Op_SaturatingSubV, "");
4840         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4841       }
4842       break;
4843     default:
4844       fatal("Unsupported type %s", type2name(elem_bt));
4845       break;
4846   }
4847 }
4848 
4849 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4850                                                         XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4851   switch (elem_bt) {
4852     case T_BYTE:
4853       if (ideal_opc == Op_SaturatingAddV) {
4854         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4855       } else {
4856         assert(ideal_opc == Op_SaturatingSubV, "");
4857         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4858       }
4859       break;
4860     case T_SHORT:
4861       if (ideal_opc == Op_SaturatingAddV) {
4862         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4863       } else {
4864         assert(ideal_opc == Op_SaturatingSubV, "");
4865         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4866       }
4867       break;
4868     default:
4869       fatal("Unsupported type %s", type2name(elem_bt));
4870       break;
4871   }
4872 }
4873 
4874 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4875                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4876                                     bool is_varshift) {
4877   switch (ideal_opc) {
4878     case Op_AddVB:
4879       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4880     case Op_AddVS:
4881       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4882     case Op_AddVI:
4883       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4884     case Op_AddVL:
4885       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4886     case Op_AddVF:
4887       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4888     case Op_AddVD:
4889       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4890     case Op_SubVB:
4891       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4892     case Op_SubVS:
4893       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4894     case Op_SubVI:
4895       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4896     case Op_SubVL:
4897       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4898     case Op_SubVF:
4899       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4900     case Op_SubVD:
4901       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4902     case Op_MulVS:
4903       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4904     case Op_MulVI:
4905       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4906     case Op_MulVL:
4907       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4908     case Op_MulVF:
4909       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4910     case Op_MulVD:
4911       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4912     case Op_DivVF:
4913       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4914     case Op_DivVD:
4915       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4916     case Op_SqrtVF:
4917       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4918     case Op_SqrtVD:
4919       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4920     case Op_AbsVB:
4921       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4922     case Op_AbsVS:
4923       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4924     case Op_AbsVI:
4925       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4926     case Op_AbsVL:
4927       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4928     case Op_FmaVF:
4929       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4930     case Op_FmaVD:
4931       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4932     case Op_VectorRearrange:
4933       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4934     case Op_LShiftVS:
4935       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4936     case Op_LShiftVI:
4937       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4938     case Op_LShiftVL:
4939       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4940     case Op_RShiftVS:
4941       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4942     case Op_RShiftVI:
4943       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4944     case Op_RShiftVL:
4945       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4946     case Op_URShiftVS:
4947       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4948     case Op_URShiftVI:
4949       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4950     case Op_URShiftVL:
4951       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4952     case Op_RotateLeftV:
4953       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4954     case Op_RotateRightV:
4955       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4956     case Op_MaxV:
4957       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4958     case Op_MinV:
4959       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4960     case Op_UMinV:
4961       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4962     case Op_UMaxV:
4963       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4964     case Op_XorV:
4965       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4966     case Op_OrV:
4967       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4968     case Op_AndV:
4969       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4970     default:
4971       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4972       break;
4973   }
4974 }
4975 
4976 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4977                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4978   switch (ideal_opc) {
4979     case Op_AddVB:
4980       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4981     case Op_AddVS:
4982       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4983     case Op_AddVI:
4984       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4985     case Op_AddVL:
4986       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4987     case Op_AddVF:
4988       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4989     case Op_AddVD:
4990       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4991     case Op_SubVB:
4992       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4993     case Op_SubVS:
4994       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4995     case Op_SubVI:
4996       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4997     case Op_SubVL:
4998       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4999     case Op_SubVF:
5000       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
5001     case Op_SubVD:
5002       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
5003     case Op_MulVS:
5004       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
5005     case Op_MulVI:
5006       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
5007     case Op_MulVL:
5008       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
5009     case Op_MulVF:
5010       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
5011     case Op_MulVD:
5012       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
5013     case Op_DivVF:
5014       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
5015     case Op_DivVD:
5016       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
5017     case Op_FmaVF:
5018       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
5019     case Op_FmaVD:
5020       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
5021     case Op_MaxV:
5022       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5023     case Op_MinV:
5024       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5025     case Op_UMaxV:
5026       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5027     case Op_UMinV:
5028       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5029     case Op_XorV:
5030       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5031     case Op_OrV:
5032       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5033     case Op_AndV:
5034       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5035     default:
5036       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
5037       break;
5038   }
5039 }
5040 
5041 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
5042                                   KRegister src1, KRegister src2) {
5043   BasicType etype = T_ILLEGAL;
5044   switch(mask_len) {
5045     case 2:
5046     case 4:
5047     case 8:  etype = T_BYTE; break;
5048     case 16: etype = T_SHORT; break;
5049     case 32: etype = T_INT; break;
5050     case 64: etype = T_LONG; break;
5051     default: fatal("Unsupported type"); break;
5052   }
5053   assert(etype != T_ILLEGAL, "");
5054   switch(ideal_opc) {
5055     case Op_AndVMask:
5056       kand(etype, dst, src1, src2); break;
5057     case Op_OrVMask:
5058       kor(etype, dst, src1, src2); break;
5059     case Op_XorVMask:
5060       kxor(etype, dst, src1, src2); break;
5061     default:
5062       fatal("Unsupported masked operation"); break;
5063   }
5064 }
5065 
5066 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
5068  * If src is NaN, the result is 0.
5069  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
5070  * the result is equal to the value of Integer.MIN_VALUE.
5071  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
5072  * the result is equal to the value of Integer.MAX_VALUE.
5073  */
5074 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5075                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5076                                                                    Register rscratch, AddressLiteral float_sign_flip,
5077                                                                    int vec_enc) {
5078   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5079   Label done;
5080   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
5081   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5082   vptest(xtmp2, xtmp2, vec_enc);
5083   jccb(Assembler::equal, done);
5084 
5085   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
5086   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
5087 
5088   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5089   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
5090   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
5091 
  // Recompute the mask for the remaining special values.
5093   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
5094   // Extract SRC values corresponding to TRUE mask lanes.
5095   vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip the mask bits so that the MSB of the mask lanes corresponding to +ve special
  // values is set.
5098   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
5099 
5100   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
5101   bind(done);
5102 }
5103 
5104 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5105                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5106                                                                     Register rscratch, AddressLiteral float_sign_flip,
5107                                                                     int vec_enc) {
5108   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5109   Label done;
5110   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5111   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5112   kortestwl(ktmp1, ktmp1);
5113   jccb(Assembler::equal, done);
5114 
5115   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5116   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5117   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5118 
5119   kxorwl(ktmp1, ktmp1, ktmp2);
5120   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5121   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5122   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5123   bind(done);
5124 }
5125 
5126 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5127                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5128                                                                      Register rscratch, AddressLiteral double_sign_flip,
5129                                                                      int vec_enc) {
5130   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5131 
5132   Label done;
5133   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5134   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
5135   kortestwl(ktmp1, ktmp1);
5136   jccb(Assembler::equal, done);
5137 
5138   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5139   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5140   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5141 
5142   kxorwl(ktmp1, ktmp1, ktmp2);
5143   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5144   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5145   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5146   bind(done);
5147 }
5148 
5149 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5150                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5151                                                                      Register rscratch, AddressLiteral float_sign_flip,
5152                                                                      int vec_enc) {
5153   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5154   Label done;
5155   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5156   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5157   kortestwl(ktmp1, ktmp1);
5158   jccb(Assembler::equal, done);
5159 
5160   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5161   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5162   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5163 
5164   kxorwl(ktmp1, ktmp1, ktmp2);
5165   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5166   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5167   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5168   bind(done);
5169 }
5170 
5171 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
5173  * If src is NaN, the result is 0.
5174  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5175  * the result is equal to the value of Long.MIN_VALUE.
5176  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5177  * the result is equal to the value of Long.MAX_VALUE.
5178  */
5179 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5180                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5181                                                                       Register rscratch, AddressLiteral double_sign_flip,
5182                                                                       int vec_enc) {
5183   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5184 
5185   Label done;
5186   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5187   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
5188   kortestwl(ktmp1, ktmp1);
5189   jccb(Assembler::equal, done);
5190 
5191   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5192   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5193   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5194 
5195   kxorwl(ktmp1, ktmp1, ktmp2);
5196   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5197   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5198   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5199   bind(done);
5200 }
5201 
5202 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5203                                                              XMMRegister xtmp, int index, int vec_enc) {
5204    assert(vec_enc < Assembler::AVX_512bit, "");
5205    if (vec_enc == Assembler::AVX_256bit) {
5206      vextractf128_high(xtmp, src);
5207      vshufps(dst, src, xtmp, index, vec_enc);
5208    } else {
5209      vshufps(dst, src, zero, index, vec_enc);
5210    }
5211 }
5212 
5213 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5214                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5215                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
5216   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5217 
5218   Label done;
5219   // Compare the destination lanes with float_sign_flip
5220   // value to get mask for all special values.
5221   movdqu(xtmp1, float_sign_flip, rscratch);
5222   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5223   ptest(xtmp2, xtmp2);
5224   jccb(Assembler::equal, done);
5225 
5226   // Flip float_sign_flip to get max integer value.
5227   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5228   pxor(xtmp1, xtmp4);
5229 
  // Set destination lanes corresponding to unordered source lanes to zero.
5231   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5232   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5233 
  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5235   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5236   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5237 
  // Recompute the mask for the remaining special values.
5239   pxor(xtmp2, xtmp3);
5240   // Extract mask corresponding to non-negative source lanes.
5241   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5242 
  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5244   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5245   pand(xtmp3, xtmp2);
5246 
  // Replace destination lanes holding the special value (0x80000000) with max int
  // if the corresponding source lane holds a +ve value.
5249   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5250   bind(done);
5251 }
5252 
5253 
5254 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5255                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5256   switch(to_elem_bt) {
5257     case T_SHORT:
5258       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5259       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5260       vpackusdw(dst, dst, zero, vec_enc);
5261       if (vec_enc == Assembler::AVX_256bit) {
5262         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5263       }
5264       break;
5265     case  T_BYTE:
5266       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5267       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5268       vpackusdw(dst, dst, zero, vec_enc);
5269       if (vec_enc == Assembler::AVX_256bit) {
5270         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5271       }
5272       vpackuswb(dst, dst, zero, vec_enc);
5273       break;
5274     default: assert(false, "%s", type2name(to_elem_bt));
5275   }
5276 }
5277 
5278 /*
5279  * Algorithm for vector D2L and F2I conversions:-
5280  * a) Perform vector D2L/F2I cast.
5281  * b) Choose fast path if none of the result vector lane contains 0x80000000 value.
5282  *    It signifies that source value could be any of the special floating point
5283  *    values(NaN,-Inf,Inf,Max,-Min).
5284  * c) Set destination to zero if source is NaN value.
5285  * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5286  */
5287 
5288 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5289                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5290                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5291   int to_elem_sz = type2aelembytes(to_elem_bt);
5292   assert(to_elem_sz <= 4, "");
5293   vcvttps2dq(dst, src, vec_enc);
5294   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5295   if (to_elem_sz < 4) {
5296     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5297     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5298   }
5299 }
5300 
5301 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5302                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5303                                             Register rscratch, int vec_enc) {
5304   int to_elem_sz = type2aelembytes(to_elem_bt);
5305   assert(to_elem_sz <= 4, "");
5306   vcvttps2dq(dst, src, vec_enc);
5307   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5308   switch(to_elem_bt) {
5309     case T_INT:
5310       break;
5311     case T_SHORT:
5312       evpmovdw(dst, dst, vec_enc);
5313       break;
5314     case T_BYTE:
5315       evpmovdb(dst, dst, vec_enc);
5316       break;
5317     default: assert(false, "%s", type2name(to_elem_bt));
5318   }
5319 }
5320 
5321 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5322                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5323                                             Register rscratch, int vec_enc) {
5324   evcvttps2qq(dst, src, vec_enc);
5325   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5326 }
5327 
5328 // Handling for downcasting from double to integer or sub-word types on AVX2.
5329 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5330                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5331                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5332   int to_elem_sz = type2aelembytes(to_elem_bt);
5333   assert(to_elem_sz < 8, "");
5334   vcvttpd2dq(dst, src, vec_enc);
5335   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5336                                               float_sign_flip, vec_enc);
5337   if (to_elem_sz < 4) {
5338     // xtmp4 holds all zero lanes.
5339     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5340   }
5341 }
5342 
5343 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5344                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5345                                             KRegister ktmp2, AddressLiteral sign_flip,
5346                                             Register rscratch, int vec_enc) {
5347   if (VM_Version::supports_avx512dq()) {
5348     evcvttpd2qq(dst, src, vec_enc);
5349     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5350     switch(to_elem_bt) {
5351       case T_LONG:
5352         break;
5353       case T_INT:
5354         evpmovsqd(dst, dst, vec_enc);
5355         break;
5356       case T_SHORT:
5357         evpmovsqd(dst, dst, vec_enc);
5358         evpmovdw(dst, dst, vec_enc);
5359         break;
5360       case T_BYTE:
5361         evpmovsqd(dst, dst, vec_enc);
5362         evpmovdb(dst, dst, vec_enc);
5363         break;
5364       default: assert(false, "%s", type2name(to_elem_bt));
5365     }
5366   } else {
5367     assert(type2aelembytes(to_elem_bt) <= 4, "");
5368     vcvttpd2dq(dst, src, vec_enc);
5369     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5370     switch(to_elem_bt) {
5371       case T_INT:
5372         break;
5373       case T_SHORT:
5374         evpmovdw(dst, dst, vec_enc);
5375         break;
5376       case T_BYTE:
5377         evpmovdb(dst, dst, vec_enc);
5378         break;
5379       default: assert(false, "%s", type2name(to_elem_bt));
5380     }
5381   }
5382 }
5383 
5384 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5385                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5386                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode rounding towards -inf,
  // and restore the original MXCSR.RC mode afterwards.
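  // For example, floor(2.5 + 0.5) == 3 and floor(-2.5 + 0.5) == -2, matching
  // Math.round's round-half-up behaviour.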
5389   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5390 
5391   mov64(tmp, julong_cast(0.5L));
5392   evpbroadcastq(xtmp1, tmp, vec_enc);
5393   vaddpd(xtmp1, src , xtmp1, vec_enc);
5394   evcvtpd2qq(dst, xtmp1, vec_enc);
5395   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                                double_sign_flip, vec_enc);
5397 
5398   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5399 }
5400 
5401 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5402                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5403                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode rounding towards -inf,
  // and restore the original MXCSR.RC mode afterwards.
5406   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5407 
5408   movl(tmp, jint_cast(0.5));
5409   movq(xtmp1, tmp);
5410   vbroadcastss(xtmp1, xtmp1, vec_enc);
5411   vaddps(xtmp1, src , xtmp1, vec_enc);
5412   vcvtps2dq(dst, xtmp1, vec_enc);
5413   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5414                                               float_sign_flip, vec_enc);
5415 
5416   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5417 }
5418 
5419 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5420                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5421                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode rounding towards -inf,
  // and restore the original MXCSR.RC mode afterwards.
5424   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5425 
5426   movl(tmp, jint_cast(0.5));
5427   movq(xtmp1, tmp);
5428   vbroadcastss(xtmp1, xtmp1, vec_enc);
5429   vaddps(xtmp1, src , xtmp1, vec_enc);
5430   vcvtps2dq(dst, xtmp1, vec_enc);
5431   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5432 
5433   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5434 }
5435 
5436 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5437                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5438   switch (from_elem_bt) {
5439     case T_BYTE:
5440       switch (to_elem_bt) {
5441         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5442         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5443         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5444         default: ShouldNotReachHere();
5445       }
5446       break;
5447     case T_SHORT:
5448       switch (to_elem_bt) {
5449         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5450         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5451         default: ShouldNotReachHere();
5452       }
5453       break;
5454     case T_INT:
5455       assert(to_elem_bt == T_LONG, "");
5456       vpmovzxdq(dst, src, vlen_enc);
5457       break;
5458     default:
5459       ShouldNotReachHere();
5460   }
5461 }
5462 
5463 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5464                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5465   switch (from_elem_bt) {
5466     case T_BYTE:
5467       switch (to_elem_bt) {
5468         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5469         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5470         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5471         default: ShouldNotReachHere();
5472       }
5473       break;
5474     case T_SHORT:
5475       switch (to_elem_bt) {
5476         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5477         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5478         default: ShouldNotReachHere();
5479       }
5480       break;
5481     case T_INT:
5482       assert(to_elem_bt == T_LONG, "");
5483       vpmovsxdq(dst, src, vlen_enc);
5484       break;
5485     default:
5486       ShouldNotReachHere();
5487   }
5488 }
5489 
5490 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5491                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5492   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5493   assert(vlen_enc != AVX_512bit, "");
5494 
5495   int dst_bt_size = type2aelembytes(dst_bt);
5496   int src_bt_size = type2aelembytes(src_bt);
5497   if (dst_bt_size > src_bt_size) {
5498     switch (dst_bt_size / src_bt_size) {
5499       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5500       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5501       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5502       default: ShouldNotReachHere();
5503     }
5504   } else {
5505     assert(dst_bt_size < src_bt_size, "");
5506     switch (src_bt_size / dst_bt_size) {
5507       case 2: {
5508         if (vlen_enc == AVX_128bit) {
5509           vpacksswb(dst, src, src, vlen_enc);
5510         } else {
5511           vpacksswb(dst, src, src, vlen_enc);
5512           vpermq(dst, dst, 0x08, vlen_enc);
5513         }
5514         break;
5515       }
5516       case 4: {
5517         if (vlen_enc == AVX_128bit) {
5518           vpackssdw(dst, src, src, vlen_enc);
5519           vpacksswb(dst, dst, dst, vlen_enc);
5520         } else {
5521           vpackssdw(dst, src, src, vlen_enc);
5522           vpermq(dst, dst, 0x08, vlen_enc);
5523           vpacksswb(dst, dst, dst, AVX_128bit);
5524         }
5525         break;
5526       }
5527       case 8: {
5528         if (vlen_enc == AVX_128bit) {
5529           vpshufd(dst, src, 0x08, vlen_enc);
5530           vpackssdw(dst, dst, dst, vlen_enc);
5531           vpacksswb(dst, dst, dst, vlen_enc);
5532         } else {
5533           vpshufd(dst, src, 0x08, vlen_enc);
5534           vpermq(dst, dst, 0x08, vlen_enc);
5535           vpackssdw(dst, dst, dst, AVX_128bit);
5536           vpacksswb(dst, dst, dst, AVX_128bit);
5537         }
5538         break;
5539       }
5540       default: ShouldNotReachHere();
5541     }
5542   }
5543 }
5544 
5545 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5546                                    bool merge, BasicType bt, int vlen_enc) {
5547   if (bt == T_INT) {
5548     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5549   } else {
5550     assert(bt == T_LONG, "");
5551     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5552   }
5553 }
5554 
5555 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5556                                    bool merge, BasicType bt, int vlen_enc) {
5557   if (bt == T_INT) {
5558     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5559   } else {
5560     assert(bt == T_LONG, "");
5561     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5562   }
5563 }
5564 
5565 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5566                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5567                                                int vec_enc) {
5568   int index = 0;
5569   int vindex = 0;
5570   mov64(rtmp1, 0x0101010101010101L);
5571   pdepq(rtmp1, src, rtmp1);
5572   if (mask_len > 8) {
5573     movq(rtmp2, src);
5574     vpxor(xtmp, xtmp, xtmp, vec_enc);
5575     movq(xtmp, rtmp1);
5576   }
5577   movq(dst, rtmp1);
5578 
5579   mask_len -= 8;
5580   while (mask_len > 0) {
5581     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5582     index++;
5583     if ((index % 2) == 0) {
5584       pxor(xtmp, xtmp);
5585     }
5586     mov64(rtmp1, 0x0101010101010101L);
5587     shrq(rtmp2, 8);
5588     pdepq(rtmp1, rtmp2, rtmp1);
5589     pinsrq(xtmp, rtmp1, index % 2);
5590     vindex = index / 2;
5591     if (vindex) {
5592       // Write the entire 16 byte vector when both 64 bit
5593       // lanes are updated, to save redundant instructions.
5594       if (index % 2) {
5595         vinsertf128(dst, dst, xtmp, vindex);
5596       }
5597     } else {
5598       vmovdqu(dst, xtmp);
5599     }
5600     mask_len -= 8;
5601   }
5602 }
5603 
5604 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5605   switch(opc) {
5606     case Op_VectorMaskTrueCount:
5607       popcntq(dst, tmp);
5608       break;
5609     case Op_VectorMaskLastTrue:
5610       if (VM_Version::supports_lzcnt()) {
5611         lzcntq(tmp, tmp);
5612         movl(dst, 63);
5613         subl(dst, tmp);
5614       } else {
5615         movl(dst, -1);
5616         bsrq(tmp, tmp);
5617         cmov32(Assembler::notZero, dst, tmp);
5618       }
5619       break;
5620     case Op_VectorMaskFirstTrue:
5621       if (VM_Version::supports_bmi1()) {
5622         if (masklen < 32) {
5623           orl(tmp, 1 << masklen);
5624           tzcntl(dst, tmp);
5625         } else if (masklen == 32) {
5626           tzcntl(dst, tmp);
5627         } else {
5628           assert(masklen == 64, "");
5629           tzcntq(dst, tmp);
5630         }
5631       } else {
5632         if (masklen < 32) {
5633           orl(tmp, 1 << masklen);
5634           bsfl(dst, tmp);
5635         } else {
5636           assert(masklen == 32 || masklen == 64, "");
5637           movl(dst, masklen);
5638           if (masklen == 32)  {
5639             bsfl(tmp, tmp);
5640           } else {
5641             bsfq(tmp, tmp);
5642           }
5643           cmov32(Assembler::notZero, dst, tmp);
5644         }
5645       }
5646       break;
5647     case Op_VectorMaskToLong:
5648       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5649       break;
5650     default: assert(false, "Unhandled mask operation");
5651   }
5652 }
5653 
5654 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5655                                               int masklen, int masksize, int vec_enc) {
5656   assert(VM_Version::supports_popcnt(), "");
5657 
5658   if(VM_Version::supports_avx512bw()) {
5659     kmovql(tmp, mask);
5660   } else {
5661     assert(masklen <= 16, "");
5662     kmovwl(tmp, mask);
5663   }
5664 
5665   // A mask generated by partial vector comparison/replicate/mask manipulation
5666   // operations needs to be clipped.
5667   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5668     andq(tmp, (1 << masklen) - 1);
5669   }
5670 
5671   vector_mask_operation_helper(opc, dst, tmp, masklen);
5672 }
5673 
5674 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5675                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5676   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5677          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5678   assert(VM_Version::supports_popcnt(), "");
5679 
5680   bool need_clip = false;
5681   switch(bt) {
5682     case T_BOOLEAN:
5683       // While masks of other types contain lane values 0 or -1, boolean masks contain lane values 0 or 1.
5684       vpxor(xtmp, xtmp, xtmp, vec_enc);
5685       vpsubb(xtmp, xtmp, mask, vec_enc);
5686       vpmovmskb(tmp, xtmp, vec_enc);
5687       need_clip = masklen < 16;
5688       break;
5689     case T_BYTE:
5690       vpmovmskb(tmp, mask, vec_enc);
5691       need_clip = masklen < 16;
5692       break;
5693     case T_SHORT:
5694       vpacksswb(xtmp, mask, mask, vec_enc);
5695       if (masklen >= 16) {
5696         vpermpd(xtmp, xtmp, 8, vec_enc);
5697       }
5698       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5699       need_clip = masklen < 16;
5700       break;
5701     case T_INT:
5702     case T_FLOAT:
5703       vmovmskps(tmp, mask, vec_enc);
5704       need_clip = masklen < 4;
5705       break;
5706     case T_LONG:
5707     case T_DOUBLE:
5708       vmovmskpd(tmp, mask, vec_enc);
5709       need_clip = masklen < 2;
5710       break;
5711     default: assert(false, "Unhandled type, %s", type2name(bt));
5712   }
5713 
5714   // A mask generated by partial vector comparison/replicate/mask manipulation
5715   // operations needs to be clipped.
5716   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5717     // need_clip implies masklen < 32
5718     andq(tmp, (1 << masklen) - 1);
5719   }
5720 
5721   vector_mask_operation_helper(opc, dst, tmp, masklen);
5722 }
5723 
5724 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5725                                              Register rtmp2, int mask_len) {
5726   kmov(rtmp1, src);
5727   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5728   mov64(rtmp2, -1L);
5729   pextq(rtmp2, rtmp2, rtmp1);
5730   kmov(dst, rtmp2);
5731 }
5732 
5733 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5734                                                     XMMRegister mask, Register rtmp, Register rscratch,
5735                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5736                                                     int vec_enc) {
5737   assert(type2aelembytes(bt) >= 4, "");
5738   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5739   address compress_perm_table = nullptr;
5740   address expand_perm_table = nullptr;
5741   if (type2aelembytes(bt) == 8) {
5742     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5743     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5744     vmovmskpd(rtmp, mask, vec_enc);
5745   } else {
5746     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5747     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5748     vmovmskps(rtmp, mask, vec_enc);
5749   }
5750   shlq(rtmp, 5); // for 32 byte permute row.
5751   if (opcode == Op_CompressV) {
5752     lea(rscratch, ExternalAddress(compress_perm_table));
5753   } else {
5754     lea(rscratch, ExternalAddress(expand_perm_table));
5755   }
5756   addptr(rtmp, rscratch);
5757   vmovdqu(permv, Address(rtmp));
5758   vpermps(dst, permv, src, Assembler::AVX_256bit);
5759   vpxor(xtmp, xtmp, xtmp, vec_enc);
5760   // Blend the result with a zero vector using the permute mask: each column entry
5761   // in a permute table row contains either a valid permute index or a -1 (default)
5762   // value, so the row can also serve as a blending mask after
5763   // compressing/expanding the source vector lanes.
5764   vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
5765 }
5766 
5767 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5768                                                bool merge, BasicType bt, int vec_enc) {
5769   if (opcode == Op_CompressV) {
5770     switch(bt) {
5771     case T_BYTE:
5772       evpcompressb(dst, mask, src, merge, vec_enc);
5773       break;
5774     case T_CHAR:
5775     case T_SHORT:
5776       evpcompressw(dst, mask, src, merge, vec_enc);
5777       break;
5778     case T_INT:
5779       evpcompressd(dst, mask, src, merge, vec_enc);
5780       break;
5781     case T_FLOAT:
5782       evcompressps(dst, mask, src, merge, vec_enc);
5783       break;
5784     case T_LONG:
5785       evpcompressq(dst, mask, src, merge, vec_enc);
5786       break;
5787     case T_DOUBLE:
5788       evcompresspd(dst, mask, src, merge, vec_enc);
5789       break;
5790     default:
5791       fatal("Unsupported type %s", type2name(bt));
5792       break;
5793     }
5794   } else {
5795     assert(opcode == Op_ExpandV, "");
5796     switch(bt) {
5797     case T_BYTE:
5798       evpexpandb(dst, mask, src, merge, vec_enc);
5799       break;
5800     case T_CHAR:
5801     case T_SHORT:
5802       evpexpandw(dst, mask, src, merge, vec_enc);
5803       break;
5804     case T_INT:
5805       evpexpandd(dst, mask, src, merge, vec_enc);
5806       break;
5807     case T_FLOAT:
5808       evexpandps(dst, mask, src, merge, vec_enc);
5809       break;
5810     case T_LONG:
5811       evpexpandq(dst, mask, src, merge, vec_enc);
5812       break;
5813     case T_DOUBLE:
5814       evexpandpd(dst, mask, src, merge, vec_enc);
5815       break;
5816     default:
5817       fatal("Unsupported type %s", type2name(bt));
5818       break;
5819     }
5820   }
5821 }
5822 
5823 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5824                                            KRegister ktmp1, int vec_enc) {
5825   if (opcode == Op_SignumVD) {
5826     vsubpd(dst, zero, one, vec_enc);
5827     // dst = (src < 0) ? -1 : 1
5828     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5829     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5830     // if src is NaN, -0.0 or 0.0, return src.
5831     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5832     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5833   } else {
5834     assert(opcode == Op_SignumVF, "");
5835     vsubps(dst, zero, one, vec_enc);
5836     // dst = (src < 0) ? -1 : 1
5837     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5838     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5839     // if src is NaN, -0.0 or 0.0, return src.
5840     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5841     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5842   }
5843 }
5844 
5845 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5846                                           XMMRegister xtmp1, int vec_enc) {
5847   if (opcode == Op_SignumVD) {
5848     vsubpd(dst, zero, one, vec_enc);
5849     // dst = (src < 0) ? -1 : 1
5850     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
5851     // if src is NaN, -0.0 or 0.0, return src.
5852     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5853     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5854   } else {
5855     assert(opcode == Op_SignumVF, "");
5856     vsubps(dst, zero, one, vec_enc);
5857     // dst = (src < 0) ? -1 : 1
5858     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
5859     // if src is NaN, -0.0 or 0.0, return src.
5860     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5861     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5862   }
5863 }
5864 
5865 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5866   if (VM_Version::supports_avx512bw()) {
5867     if (mask_len > 32) {
5868       kmovql(dst, src);
5869     } else {
5870       kmovdl(dst, src);
5871       if (mask_len != 32) {
5872         kshiftrdl(dst, dst, 32 - mask_len);
5873       }
5874     }
5875   } else {
5876     assert(mask_len <= 16, "");
5877     kmovwl(dst, src);
5878     if (mask_len != 16) {
5879       kshiftrwl(dst, dst, 16 - mask_len);
5880     }
5881   }
5882 }
5883 
5884 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5885   int lane_size = type2aelembytes(bt);
5886   if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5887       (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5888     movptr(rtmp, imm32);
5889     switch(lane_size) {
5890       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5891       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5892       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5893       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5894       default: fatal("Unsupported lane size %d", lane_size);
5895                break;
5896     }
5897   } else {
5898     movptr(rtmp, imm32);
5899     movq(dst, rtmp);
5900     switch(lane_size) {
5901       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5902       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5903       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5904       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5905       default: fatal("Unsupported lane size %d", lane_size);
5906                break;
5907     }
5908   }
5909 }
5910 
5911 //
5912 // The following is a lookup table based popcount computation algorithm:
5913 //       Index   Bit set count
5914 //     [ 0000 ->   0,
5915 //       0001 ->   1,
5916 //       0010 ->   1,
5917 //       0011 ->   2,
5918 //       0100 ->   1,
5919 //       0101 ->   2,
5920 //       0110 ->   2,
5921 //       0111 ->   3,
5922 //       1000 ->   1,
5923 //       1001 ->   2,
5924 //       1010 ->   3,
5925 //       1011 ->   3,
5926 //       1100 ->   2,
5927 //       1101 ->   3,
     //       1110 ->   3,
5928 //       1111 ->   4 ]
5929 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5930 //     shuffle indices for lookup table access.
5931 //  b. Right shift each byte of vector lane by 4 positions.
5932 //  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
5933 //     shuffle indices for lookup table access.
5934 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5935 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5936 //     count of all the bytes of a quadword.
5937 //  f. Perform step e. for upper 128bit vector lane.
5938 //  g. Pack the bitset count of quadwords back to double word.
5939 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
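     //
     //  Worked example for a single byte 0xB5 (1011 0101): the lower nibble 0101 maps to
     //  LUT[5] = 2 and the upper nibble 1011 maps to LUT[11] = 3, giving a per-byte
     //  popcount of 2 + 3 = 5, which matches popcount(0xB5).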
5940 
5941 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5942                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5943   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5944   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5945   vpsrlw(dst, src, 4, vec_enc);
5946   vpand(dst, dst, xtmp1, vec_enc);
5947   vpand(xtmp1, src, xtmp1, vec_enc);
5948   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5949   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5950   vpshufb(dst, xtmp2, dst, vec_enc);
5951   vpaddb(dst, dst, xtmp1, vec_enc);
5952 }
5953 
5954 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5955                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5956   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5957   // Following code is as per steps e,f,g and h of above algorithm.
5958   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5959   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5960   vpsadbw(dst, dst, xtmp2, vec_enc);
5961   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5962   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5963   vpackuswb(dst, xtmp1, dst, vec_enc);
5964 }
5965 
5966 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5967                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5968   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5969   // Add the popcount of upper and lower bytes of word.
5970   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5971   vpsrlw(dst, xtmp1, 8, vec_enc);
5972   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5973   vpaddw(dst, dst, xtmp1, vec_enc);
5974 }
5975 
5976 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5977                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5978   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5979   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5980   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5981 }
5982 
5983 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5984                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5985   switch(bt) {
5986     case T_LONG:
5987       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5988       break;
5989     case T_INT:
5990       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5991       break;
5992     case T_CHAR:
5993     case T_SHORT:
5994       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5995       break;
5996     case T_BYTE:
5997     case T_BOOLEAN:
5998       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5999       break;
6000     default:
6001       fatal("Unsupported type %s", type2name(bt));
6002       break;
6003   }
6004 }
6005 
6006 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6007                                                       KRegister mask, bool merge, int vec_enc) {
6008   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6009   switch(bt) {
6010     case T_LONG:
6011       assert(VM_Version::supports_avx512_vpopcntdq(), "");
6012       evpopcntq(dst, mask, src, merge, vec_enc);
6013       break;
6014     case T_INT:
6015       assert(VM_Version::supports_avx512_vpopcntdq(), "");
6016       evpopcntd(dst, mask, src, merge, vec_enc);
6017       break;
6018     case T_CHAR:
6019     case T_SHORT:
6020       assert(VM_Version::supports_avx512_bitalg(), "");
6021       evpopcntw(dst, mask, src, merge, vec_enc);
6022       break;
6023     case T_BYTE:
6024     case T_BOOLEAN:
6025       assert(VM_Version::supports_avx512_bitalg(), "");
6026       evpopcntb(dst, mask, src, merge, vec_enc);
6027       break;
6028     default:
6029       fatal("Unsupported type %s", type2name(bt));
6030       break;
6031   }
6032 }
6033 
6034 // The bit reversal algorithm first reverses the bits of each byte, followed by
6035 // a byte level reversal for multi-byte primitive types (short/int/long).
6036 // The algorithm performs a lookup table access to get the reverse bit sequence
6037 // corresponding to a 4 bit value. Thus the reverse bit sequence of a byte
6038 // is obtained by swapping the reverse bit sequences of the upper and lower
6039 // nibbles of the byte.
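     // For example, for the byte 0x2D (0010 1101): the reversed lower nibble 1101 is 1011 and
     // the reversed upper nibble 0010 is 0100; placing the reversed lower nibble in the upper
     // half and the reversed upper nibble in the lower half yields 1011 0100, i.e. 0xB4,
     // which is 0x2D with its bits reversed.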
6040 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6041                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
6042   if (VM_Version::supports_avx512vlbw()) {
6043 
6044     // Get the reverse bit sequence of lower nibble of each byte.
6045     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
6046     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6047     evpandq(dst, xtmp2, src, vec_enc);
6048     vpshufb(dst, xtmp1, dst, vec_enc);
6049     vpsllq(dst, dst, 4, vec_enc);
6050 
6051     // Get the reverse bit sequence of upper nibble of each byte.
6052     vpandn(xtmp2, xtmp2, src, vec_enc);
6053     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6054     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6055 
6056     // Perform a logical OR between the left shifted reverse bit sequence of the lower nibble and
6057     // the right shifted reverse bit sequence of the upper nibble to obtain the reverse bit sequence of each byte.
6058     evporq(xtmp2, dst, xtmp2, vec_enc);
6059     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6060 
6061   } else if(vec_enc == Assembler::AVX_512bit) {
6062     // Shift based bit reversal.
6063     assert(bt == T_LONG || bt == T_INT, "");
6064 
6065     // Swap lower and upper nibble of each byte.
6066     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
6067 
6068     // Swap two least and most significant bits of each nibble.
6069     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
6070 
6071     // Swap adjacent pair of bits.
6072     evmovdqul(xtmp1, k0, dst, true, vec_enc);
6073     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
6074 
6075     evmovdqul(xtmp1, k0, dst, true, vec_enc);
6076     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
6077   } else {
6078     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
6079     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6080 
6081     // Get the reverse bit sequence of lower nibble of each byte.
6082     vpand(dst, xtmp2, src, vec_enc);
6083     vpshufb(dst, xtmp1, dst, vec_enc);
6084     vpsllq(dst, dst, 4, vec_enc);
6085 
6086     // Get the reverse bit sequence of upper nibble of each byte.
6087     vpandn(xtmp2, xtmp2, src, vec_enc);
6088     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6089     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6090 
6091     // Perform a logical OR between the left shifted reverse bit sequence of the lower nibble and
6092     // the right shifted reverse bit sequence of the upper nibble to obtain the reverse bit sequence of each byte.
6093     vpor(xtmp2, dst, xtmp2, vec_enc);
6094     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6095   }
6096 }
6097 
6098 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
6099                                                 XMMRegister xtmp, Register rscratch) {
6100   assert(VM_Version::supports_gfni(), "");
6101   assert(rscratch != noreg || always_reachable(mask), "missing");
6102 
6103   // Galois field instruction based bit reversal based on following algorithm.
6104   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6105   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
6106   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
6107   vector_reverse_byte(bt, dst, xtmp, vec_enc);
6108 }
6109 
6110 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
6111                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
6112   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
6113   evpandq(dst, xtmp1, src, vec_enc);
6114   vpsllq(dst, dst, nbits, vec_enc);
6115   vpandn(xtmp1, xtmp1, src, vec_enc);
6116   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
6117   evporq(dst, dst, xtmp1, vec_enc);
6118 }
6119 
6120 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6121                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
6122   // Shift based bit reversal.
6123   assert(VM_Version::supports_evex(), "");
6124   switch(bt) {
6125     case T_LONG:
6126       // Swap upper and lower double word of each quad word.
6127       evprorq(xtmp1, k0, src, 32, true, vec_enc);
6128       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
6129       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6130       break;
6131     case T_INT:
6132       // Swap upper and lower word of each double word.
6133       evprord(xtmp1, k0, src, 16, true, vec_enc);
6134       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6135       break;
6136     case T_CHAR:
6137     case T_SHORT:
6138       // Swap upper and lower byte of each word.
6139       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
6140       break;
6141     case T_BYTE:
6142       evmovdquq(dst, k0, src, true, vec_enc);
6143       break;
6144     default:
6145       fatal("Unsupported type %s", type2name(bt));
6146       break;
6147   }
6148 }
6149 
6150 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
6151   if (bt == T_BYTE) {
6152     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
6153       evmovdquq(dst, k0, src, true, vec_enc);
6154     } else {
6155       vmovdqu(dst, src);
6156     }
6157     return;
6158   }
6159   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
6160   // pre-computed shuffle indices.
6161   switch(bt) {
6162     case T_LONG:
6163       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
6164       break;
6165     case T_INT:
6166       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
6167       break;
6168     case T_CHAR:
6169     case T_SHORT:
6170       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6171       break;
6172     default:
6173       fatal("Unsupported type %s", type2name(bt));
6174       break;
6175   }
6176   vpshufb(dst, src, dst, vec_enc);
6177 }
6178 
6179 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6180                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6181                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6182   assert(is_integral_type(bt), "");
6183   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6184   assert(VM_Version::supports_avx512cd(), "");
6185   switch(bt) {
6186     case T_LONG:
6187       evplzcntq(dst, ktmp, src, merge, vec_enc);
6188       break;
6189     case T_INT:
6190       evplzcntd(dst, ktmp, src, merge, vec_enc);
6191       break;
6192     case T_SHORT:
6193       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6194       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6195       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6196       vpunpckhwd(dst, xtmp1, src, vec_enc);
6197       evplzcntd(dst, ktmp, dst, merge, vec_enc);
6198       vpackusdw(dst, xtmp2, dst, vec_enc);
6199       break;
6200     case T_BYTE:
6201       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6202       // accessing the lookup table.
6203       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6204       // accessing the lookup table.
6205       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6206       assert(VM_Version::supports_avx512bw(), "");
6207       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6208       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6209       vpand(xtmp2, dst, src, vec_enc);
6210       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6211       vpsrlw(xtmp3, src, 4, vec_enc);
6212       vpand(xtmp3, dst, xtmp3, vec_enc);
6213       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6214       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6215       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6216       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6217       break;
6218     default:
6219       fatal("Unsupported type %s", type2name(bt));
6220       break;
6221   }
6222 }
6223 
6224 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6225                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6226   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6227   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6228   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6229   // accessing the lookup table.
6230   vpand(dst, xtmp2, src, vec_enc);
6231   vpshufb(dst, xtmp1, dst, vec_enc);
6232   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6233   // accessing the lookup table.
6234   vpsrlw(xtmp3, src, 4, vec_enc);
6235   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6236   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6237   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6238   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6239   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6240   vpaddb(dst, dst, xtmp2, vec_enc);
6241   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6242 }
6243 
6244 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6245                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6246   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6247   // Add zero counts of lower byte and upper byte of a word if
6248   // upper byte holds a zero value.
6249   vpsrlw(xtmp3, src, 8, vec_enc);
6250   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6251   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6252   vpsllw(xtmp2, dst, 8, vec_enc);
6253   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6254   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6255   vpsrlw(dst, dst, 8, vec_enc);
6256 }
6257 
6258 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6259                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
6260   // Since the IEEE 754 floating point format represents the mantissa in normalized 1.x form,
6261   // the biased exponent can be used to compute the leading zero count as per the
6262   // following formula:
6263   //    LZCNT = 31 - (biased_exp - 127)
6264   // Special handling has been introduced for zero, max_int and negative source values.
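       // For example, src = 8 (single set bit at position 3): float(8) has a biased exponent of
       // 130, so LZCNT = 31 - (130 - 127) = 28, which is the number of leading zeros of 8 as a
       // 32-bit integer.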
6265 
6266   // Broadcast 0xFF
6267   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6268   vpsrld(xtmp1, xtmp1, 24, vec_enc);
6269 
6270   // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
6271   // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
6272   // contributes to the leading number of zeros.
6273   vpsrld(xtmp2, src, 1, vec_enc);
6274   vpandn(xtmp3, xtmp2, src, vec_enc);
6275 
6276   // Extract biased exponent.
6277   vcvtdq2ps(dst, xtmp3, vec_enc);
6278   vpsrld(dst, dst, 23, vec_enc);
6279   vpand(dst, dst, xtmp1, vec_enc);
6280 
6281   // Broadcast 127.
6282   vpsrld(xtmp1, xtmp1, 1, vec_enc);
6283   // Exponent = biased_exp - 127
6284   vpsubd(dst, dst, xtmp1, vec_enc);
6285 
6286   // Exponent_plus_one = Exponent + 1
6287   vpsrld(xtmp3, xtmp1, 6, vec_enc);
6288   vpaddd(dst, dst, xtmp3, vec_enc);
6289 
6290   // Replace -ve exponent with zero, exponent is -ve when src
6291   // lane contains a zero value.
6292   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6293   vblendvps(dst, dst, xtmp2, dst, vec_enc);
6294 
6295   // Rematerialize broadcast 32.
6296   vpslld(xtmp1, xtmp3, 5, vec_enc);
6297   // Exponent is 32 if corresponding source lane contains max_int value.
6298   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6299   // LZCNT = 32 - exponent_plus_one
6300   vpsubd(dst, xtmp1, dst, vec_enc);
6301 
6302   // Replace LZCNT with a value 1 if corresponding source lane
6303   // contains max_int value.
6304   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6305 
6306   // Replace biased_exp with 0 if source lane value is less than zero.
6307   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6308   vblendvps(dst, dst, xtmp2, src, vec_enc);
6309 }
6310 
6311 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6312                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6313   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6314   // Add zero counts of lower word and upper word of a double word if
6315   // upper word holds a zero value.
6316   vpsrld(xtmp3, src, 16, vec_enc);
6317   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6318   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6319   vpslld(xtmp2, dst, 16, vec_enc);
6320   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6321   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6322   vpsrld(dst, dst, 16, vec_enc);
6323   // Add zero counts of lower doubleword and upper doubleword of a
6324   // quadword if upper doubleword holds a zero value.
6325   vpsrlq(xtmp3, src, 32, vec_enc);
6326   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6327   vpsllq(xtmp2, dst, 32, vec_enc);
6328   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6329   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6330   vpsrlq(dst, dst, 32, vec_enc);
6331 }
6332 
6333 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6334                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6335                                                        Register rtmp, int vec_enc) {
6336   assert(is_integral_type(bt), "unexpected type");
6337   assert(vec_enc < Assembler::AVX_512bit, "");
6338   switch(bt) {
6339     case T_LONG:
6340       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6341       break;
6342     case T_INT:
6343       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6344       break;
6345     case T_SHORT:
6346       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6347       break;
6348     case T_BYTE:
6349       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6350       break;
6351     default:
6352       fatal("Unsupported type %s", type2name(bt));
6353       break;
6354   }
6355 }
6356 
6357 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6358   switch(bt) {
6359     case T_BYTE:
6360       vpsubb(dst, src1, src2, vec_enc);
6361       break;
6362     case T_SHORT:
6363       vpsubw(dst, src1, src2, vec_enc);
6364       break;
6365     case T_INT:
6366       vpsubd(dst, src1, src2, vec_enc);
6367       break;
6368     case T_LONG:
6369       vpsubq(dst, src1, src2, vec_enc);
6370       break;
6371     default:
6372       fatal("Unsupported type %s", type2name(bt));
6373       break;
6374   }
6375 }
6376 
6377 // Trailing zero count computation is based on the leading zero count operation as per
6378 // the following equation. All AVX3 targets support the AVX512CD feature, which offers
6379 // a direct vector instruction to compute the leading zero count.
6380 //      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
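     //      e.g. for 32-bit x = 0b1000: (x - 1) & ~x = 0b0111, CLZ(0b0111) = 29,
     //      so CTZ = 32 - 29 = 3.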
6381 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6382                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6383                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6384   assert(is_integral_type(bt), "");
6385   // xtmp = -1
6386   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6387   // xtmp = xtmp + src
6388   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6389   // xtmp = xtmp & ~src
6390   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6391   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6392   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6393   vpsub(bt, dst, xtmp4, dst, vec_enc);
6394 }
6395 
6396 // Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation:
6397 //      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
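     //      e.g. for 32-bit x = 0b1000: x | -x sets bit 3 and all bits above it,
     //      so POPC(x | -x) = 29 and CTZ = 32 - 29 = 3.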
6398 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6399                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6400   assert(is_integral_type(bt), "");
6401   // xtmp = 0
6402   vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
6403   // xtmp = 0 - src
6404   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6405   // xtmp = xtmp | src
6406   vpor(xtmp3, xtmp3, src, vec_enc);
6407   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6408   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6409   vpsub(bt, dst, xtmp1, dst, vec_enc);
6410 }
6411 
6412 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6413   Label done;
6414   Label neg_divisor_fastpath;
6415   cmpl(divisor, 0);
6416   jccb(Assembler::less, neg_divisor_fastpath);
6417   xorl(rdx, rdx);
6418   divl(divisor);
6419   jmpb(done);
6420   bind(neg_divisor_fastpath);
6421   // Fastpath for divisor < 0:
6422   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6423   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
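       // When the divisor is negative as a signed int it is >= 2^31 as an unsigned int, so the
       // unsigned quotient can only be 0 or 1; it is 1 exactly when dividend >=u divisor, which
       // the mask-and-shift sequence below computes without issuing a divide.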
6424   movl(rdx, rax);
6425   subl(rdx, divisor);
6426   if (VM_Version::supports_bmi1()) {
6427     andnl(rax, rdx, rax);
6428   } else {
6429     notl(rdx);
6430     andl(rax, rdx);
6431   }
6432   shrl(rax, 31);
6433   bind(done);
6434 }
6435 
6436 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6437   Label done;
6438   Label neg_divisor_fastpath;
6439   cmpl(divisor, 0);
6440   jccb(Assembler::less, neg_divisor_fastpath);
6441   xorl(rdx, rdx);
6442   divl(divisor);
6443   jmpb(done);
6444   bind(neg_divisor_fastpath);
6445   // Fastpath when divisor < 0:
6446   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6447   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6448   movl(rdx, rax);
6449   subl(rax, divisor);
6450   if (VM_Version::supports_bmi1()) {
6451     andnl(rax, rax, rdx);
6452   } else {
6453     notl(rax);
6454     andl(rax, rdx);
6455   }
6456   sarl(rax, 31);
6457   andl(rax, divisor);
6458   subl(rdx, rax);
6459   bind(done);
6460 }
6461 
6462 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6463   Label done;
6464   Label neg_divisor_fastpath;
6465 
6466   cmpl(divisor, 0);
6467   jccb(Assembler::less, neg_divisor_fastpath);
6468   xorl(rdx, rdx);
6469   divl(divisor);
6470   jmpb(done);
6471   bind(neg_divisor_fastpath);
6472   // Fastpath for divisor < 0:
6473   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6474   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6475   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6476   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6477   movl(rdx, rax);
6478   subl(rax, divisor);
6479   if (VM_Version::supports_bmi1()) {
6480     andnl(rax, rax, rdx);
6481   } else {
6482     notl(rax);
6483     andl(rax, rdx);
6484   }
6485   movl(tmp, rax);
6486   shrl(rax, 31); // quotient
6487   sarl(tmp, 31);
6488   andl(tmp, divisor);
6489   subl(rdx, tmp); // remainder
6490   bind(done);
6491 }
6492 
6493 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6494                                  XMMRegister xtmp2, Register rtmp) {
6495   if(VM_Version::supports_gfni()) {
6496     // Galois field instruction based bit reversal based on following algorithm.
6497     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6498     mov64(rtmp, 0x8040201008040201L);
6499     movq(xtmp1, src);
6500     movq(xtmp2, rtmp);
6501     gf2p8affineqb(xtmp1, xtmp2, 0);
6502     movq(dst, xtmp1);
6503   } else {
6504     // Swap even and odd numbered bits.
6505     movl(rtmp, src);
6506     andl(rtmp, 0x55555555);
6507     shll(rtmp, 1);
6508     movl(dst, src);
6509     andl(dst, 0xAAAAAAAA);
6510     shrl(dst, 1);
6511     orl(dst, rtmp);
6512 
6513     // Swap LSB and MSB 2 bits of each nibble.
6514     movl(rtmp, dst);
6515     andl(rtmp, 0x33333333);
6516     shll(rtmp, 2);
6517     andl(dst, 0xCCCCCCCC);
6518     shrl(dst, 2);
6519     orl(dst, rtmp);
6520 
6521     // Swap LSB and MSB 4 bits of each byte.
6522     movl(rtmp, dst);
6523     andl(rtmp, 0x0F0F0F0F);
6524     shll(rtmp, 4);
6525     andl(dst, 0xF0F0F0F0);
6526     shrl(dst, 4);
6527     orl(dst, rtmp);
6528   }
6529   bswapl(dst);
6530 }
6531 
6532 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6533                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
6534   if(VM_Version::supports_gfni()) {
6535     // Galois field instruction based bit reversal based on following algorithm.
6536     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6537     mov64(rtmp1, 0x8040201008040201L);
6538     movq(xtmp1, src);
6539     movq(xtmp2, rtmp1);
6540     gf2p8affineqb(xtmp1, xtmp2, 0);
6541     movq(dst, xtmp1);
6542   } else {
6543     // Swap even and odd numbered bits.
6544     movq(rtmp1, src);
6545     mov64(rtmp2, 0x5555555555555555L);
6546     andq(rtmp1, rtmp2);
6547     shlq(rtmp1, 1);
6548     movq(dst, src);
6549     notq(rtmp2);
6550     andq(dst, rtmp2);
6551     shrq(dst, 1);
6552     orq(dst, rtmp1);
6553 
6554     // Swap LSB and MSB 2 bits of each nibble.
6555     movq(rtmp1, dst);
6556     mov64(rtmp2, 0x3333333333333333L);
6557     andq(rtmp1, rtmp2);
6558     shlq(rtmp1, 2);
6559     notq(rtmp2);
6560     andq(dst, rtmp2);
6561     shrq(dst, 2);
6562     orq(dst, rtmp1);
6563 
6564     // Swap LSB and MSB 4 bits of each byte.
6565     movq(rtmp1, dst);
6566     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6567     andq(rtmp1, rtmp2);
6568     shlq(rtmp1, 4);
6569     notq(rtmp2);
6570     andq(dst, rtmp2);
6571     shrq(dst, 4);
6572     orq(dst, rtmp1);
6573   }
6574   bswapq(dst);
6575 }
6576 
6577 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6578   Label done;
6579   Label neg_divisor_fastpath;
6580   cmpq(divisor, 0);
6581   jccb(Assembler::less, neg_divisor_fastpath);
6582   xorl(rdx, rdx);
6583   divq(divisor);
6584   jmpb(done);
6585   bind(neg_divisor_fastpath);
6586   // Fastpath for divisor < 0:
6587   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6588   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6589   movq(rdx, rax);
6590   subq(rdx, divisor);
6591   if (VM_Version::supports_bmi1()) {
6592     andnq(rax, rdx, rax);
6593   } else {
6594     notq(rdx);
6595     andq(rax, rdx);
6596   }
6597   shrq(rax, 63);
6598   bind(done);
6599 }
6600 
6601 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6602   Label done;
6603   Label neg_divisor_fastpath;
6604   cmpq(divisor, 0);
6605   jccb(Assembler::less, neg_divisor_fastpath);
6606   xorq(rdx, rdx);
6607   divq(divisor);
6608   jmp(done);
6609   bind(neg_divisor_fastpath);
6610   // Fastpath when divisor < 0:
6611   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6612   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6613   movq(rdx, rax);
6614   subq(rax, divisor);
6615   if (VM_Version::supports_bmi1()) {
6616     andnq(rax, rax, rdx);
6617   } else {
6618     notq(rax);
6619     andq(rax, rdx);
6620   }
6621   sarq(rax, 63);
6622   andq(rax, divisor);
6623   subq(rdx, rax);
6624   bind(done);
6625 }
6626 
6627 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6628   Label done;
6629   Label neg_divisor_fastpath;
6630   cmpq(divisor, 0);
6631   jccb(Assembler::less, neg_divisor_fastpath);
6632   xorq(rdx, rdx);
6633   divq(divisor);
6634   jmp(done);
6635   bind(neg_divisor_fastpath);
6636   // Fastpath for divisor < 0:
6637   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6638   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6639   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6640   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6641   movq(rdx, rax);
6642   subq(rax, divisor);
6643   if (VM_Version::supports_bmi1()) {
6644     andnq(rax, rax, rdx);
6645   } else {
6646     notq(rax);
6647     andq(rax, rdx);
6648   }
6649   movq(tmp, rax);
6650   shrq(rax, 63); // quotient
6651   sarq(tmp, 63);
6652   andq(tmp, divisor);
6653   subq(rdx, tmp); // remainder
6654   bind(done);
6655 }
6656 
6657 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6658                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6659                                         int vlen_enc) {
6660   assert(VM_Version::supports_avx512bw(), "");
6661   // Byte shuffles are in-lane operations and indices are determined
6662   // using the lower 4 bits of each shuffle lane, thus all shuffle
6663   // indices are normalized to the index range 0-15. This makes sure
6664   // that indices which differ by a multiple of 16 (e.g. 16, 32 and 48)
6665   // resolve to the same relative position within their respective
6666   // 128 bit lanes.
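       // For example, shuffle index 37 selects byte 5 (37 & 0xF) of the third 128 bit source
       // lane (indices 32-47), so it is handled by the iteration below that broadcasts that lane.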
6667   movl(rtmp, 16);
6668   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6669 
6670   // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
6671   // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
6672   // original shuffle indices and move the shuffled lanes corresponding to true
6673   // mask to destination vector.
6674   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6675   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6676   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6677 
6678   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6679   // and broadcasting second 128 bit lane.
6680   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6681   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6682   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6683   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6684   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6685 
6686   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6687   // and broadcasting third 128 bit lane.
6688   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6689   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6690   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6691   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6692   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6693 
6694   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
6695   // and broadcasting the fourth 128 bit lane.
6696   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6697   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6698   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6699   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6700   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6701 }
6702 
6703 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6704                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6705   if (vlen_enc == AVX_128bit) {
6706     vpermilps(dst, src, shuffle, vlen_enc);
6707   } else if (bt == T_INT) {
6708     vpermd(dst, shuffle, src, vlen_enc);
6709   } else {
6710     assert(bt == T_FLOAT, "");
6711     vpermps(dst, shuffle, src, vlen_enc);
6712   }
6713 }
6714 
6715 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6716   switch(opcode) {
6717     case Op_AddHF: vaddsh(dst, src1, src2); break;
6718     case Op_SubHF: vsubsh(dst, src1, src2); break;
6719     case Op_MulHF: vmulsh(dst, src1, src2); break;
6720     case Op_DivHF: vdivsh(dst, src1, src2); break;
6721     default: assert(false, "%s", NodeClassNames[opcode]); break;
6722   }
6723 }
6724 
6725 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6726   switch(elem_bt) {
6727     case T_BYTE:
6728       if (ideal_opc == Op_SaturatingAddV) {
6729         vpaddsb(dst, src1, src2, vlen_enc);
6730       } else {
6731         assert(ideal_opc == Op_SaturatingSubV, "");
6732         vpsubsb(dst, src1, src2, vlen_enc);
6733       }
6734       break;
6735     case T_SHORT:
6736       if (ideal_opc == Op_SaturatingAddV) {
6737         vpaddsw(dst, src1, src2, vlen_enc);
6738       } else {
6739         assert(ideal_opc == Op_SaturatingSubV, "");
6740         vpsubsw(dst, src1, src2, vlen_enc);
6741       }
6742       break;
6743     default:
6744       fatal("Unsupported type %s", type2name(elem_bt));
6745       break;
6746   }
6747 }
6748 
6749 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6750   switch(elem_bt) {
6751     case T_BYTE:
6752       if (ideal_opc == Op_SaturatingAddV) {
6753         vpaddusb(dst, src1, src2, vlen_enc);
6754       } else {
6755         assert(ideal_opc == Op_SaturatingSubV, "");
6756         vpsubusb(dst, src1, src2, vlen_enc);
6757       }
6758       break;
6759     case T_SHORT:
6760       if (ideal_opc == Op_SaturatingAddV) {
6761         vpaddusw(dst, src1, src2, vlen_enc);
6762       } else {
6763         assert(ideal_opc == Op_SaturatingSubV, "");
6764         vpsubusw(dst, src1, src2, vlen_enc);
6765       }
6766       break;
6767     default:
6768       fatal("Unsupported type %s", type2name(elem_bt));
6769       break;
6770   }
6771 }
6772 
6773 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6774                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
6775   // For unsigned subtraction, overflow happens when the second input is greater than the first input.
6776   // overflow_mask = Inp1 <u Inp2
6777   evpcmpu(elem_bt, ktmp,  src2, src1, Assembler::lt, vlen_enc);
6778   // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6779   evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6780 }
6781 
6782 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6783                                                               XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6784   // Emulate unsigned comparison using signed comparison
6785   // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
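       // For example, with 32-bit inputs Inp1 = 1 and Inp2 = 0xFFFFFFFF: 1 <u 0xFFFFFFFF holds,
       // and after adding MIN_VALUE the signed comparison 0x80000001 < 0x7FFFFFFF
       // (-2147483647 < 2147483647) holds as well.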
6786   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6787   vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6788   vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6789 
6790   vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6791 
6792   // Res = INP1 - INP2 (non-commutative and non-associative)
6793   vpsub(elem_bt, dst, src1, src2, vlen_enc);
6794   // Res = Mask ? Zero : Res
6795   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6796   vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6797 }
6798 
6799 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6800                                                                XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
6801   // Unsigned value ranges comprise only non-negative numbers, thus only upper bound saturation exists.
6802   // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6803   // Res = Signed Add INP1, INP2
6804   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6805   // T1 = SRC1 | SRC2
6806   vpor(xtmp1, src1, src2, vlen_enc);
6807   // Max_Unsigned = -1
6808   vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6809   // Unsigned compare:  Mask = Res <u T1
6810   evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6811   // res  = Mask ? Max_Unsigned : Res
6812   evpblend(elem_bt, dst, ktmp,  dst, xtmp2, true, vlen_enc);
6813 }
6814 
6815 //
6816 // Section 2-13 of Hacker's Delight lists the following overflow detection check for the saturating
6817 // unsigned addition operation:
6818 //    overflow_mask = ((a & b) | ((a | b) & ~(a + b))) >>> 31 == 1
6819 //
6820 // We empirically determined its semantic equivalence to the following reduced expression
6821 //    overflow_mask =  (a + b) <u (a | b)
6822 //
6823 // and also verified it through the Alive2 solver.
6824 // (https://alive2.llvm.org/ce/z/XDQ7dY)
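     //
     // Illustrative check with hypothetical 32-bit values a = b = 0x80000000:
     //   original: ((a & b) | ((a | b) & ~(a + b))) >>> 31 = 0x80000000 >>> 31 = 1
     //   reduced:  (a + b) <u (a | b)  =>  0 <u 0x80000000  =>  true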
6825 //
6826 
6827 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6828                                                               XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6829   // Res = Signed Add INP1, INP2
6830   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6831   // Compute T1 = INP1 | INP2
6832   vpor(xtmp3, src1, src2, vlen_enc);
6833   // xtmp2 = Minimum signed value (MIN_VALUE); xtmp1 = all ones.
6834   vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6835   // Bias T1 into the signed range: T1<s> = T1 + MIN_VALUE
6836   vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
6837   // Bias Res into the signed range: Res<s> = Res + MIN_VALUE (result in xtmp2)
6838   vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
6839   // Compute overflow detection mask = Res<s> <s T1<s> (i.e. Res <u T1)
6840   if (elem_bt == T_INT) {
6841     vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
6842   } else {
6843     assert(elem_bt == T_LONG, "");
6844     vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
6845   }
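       // Res = overflow mask ? Max_Unsigned (all ones held in xtmp1) : Res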
6846   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6847 }
6848 
6849 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6850                                       int vlen_enc, bool xtmp2_hold_M1) {
6851   if (VM_Version::supports_avx512dq()) {
6852     evpmovq2m(ktmp, src, vlen_enc);
6853   } else {
6854     assert(VM_Version::supports_evex(), "");
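         // Emulate: replicate each qword's sign bit across the lane with an arithmetic shift,
         // then compare against -1 (held in xtmp2) to form the mask.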
6855     if (!xtmp2_hold_M1) {
6856       vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6857     }
6858     evpsraq(xtmp1, src, 63, vlen_enc);
6859     evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6860   }
6861 }
6862 
6863 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6864                                       int vlen_enc, bool xtmp2_hold_M1) {
6865   if (VM_Version::supports_avx512dq()) {
6866     evpmovd2m(ktmp, src, vlen_enc);
6867   } else {
6868     assert(VM_Version::supports_evex(), "");
6869     if (!xtmp2_hold_M1) {
6870       vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6871     }
6872     vpsrad(xtmp1, src, 31, vlen_enc);
6873     Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6874   }
6875 }
6876 
6877 
6878 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6879   if (elem_bt == T_LONG) {
6880     if (VM_Version::supports_evex()) {
6881       evpsraq(dst, src, 63, vlen_enc);
6882     } else {
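           // No 64-bit arithmetic right shift without EVEX: shift dwords right by 31, then
           // broadcast each qword's high (sign) dword into both halves (shuffle imm 0xF5).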
6883       vpsrad(dst, src, 31, vlen_enc);
6884       vpshufd(dst, dst, 0xF5, vlen_enc);
6885     }
6886   } else {
6887     assert(elem_bt == T_INT, "");
6888     vpsrad(dst, src, 31, vlen_enc);
6889   }
6890 }
6891 
6892 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6893   if (compute_allones) {
6894     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6895       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6896     } else {
6897       vpcmpeqq(allones, allones, allones, vlen_enc);
6898     }
6899   }
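       // MAX_VALUE = all ones logically shifted right by one (0x7FFF...F per lane).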
6900   if (elem_bt == T_LONG) {
6901     vpsrlq(dst, allones, 1, vlen_enc);
6902   } else {
6903     assert(elem_bt == T_INT, "");
6904     vpsrld(dst, allones, 1, vlen_enc);
6905   }
6906 }
6907 
6908 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6909   if (compute_allones) {
6910     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6911       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6912     } else {
6913       vpcmpeqq(allones, allones, allones, vlen_enc);
6914     }
6915   }
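       // MIN_VALUE = all ones shifted left so that only the sign bit remains (0x80...0 per lane).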
6916   if (elem_bt == T_LONG) {
6917     vpsllq(dst, allones, 63, vlen_enc);
6918   } else {
6919     assert(elem_bt == T_INT, "");
6920     vpslld(dst, allones, 31, vlen_enc);
6921   }
6922 }
6923 
6924 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
6925                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6926   switch(elem_bt) {
6927     case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6928     case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6929     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6930     case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6931     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6932   }
6933 }
6934 
6935 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6936   switch(elem_bt) {
6937     case  T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
6938     case  T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
6939     case  T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6940     case  T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
6941     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6942   }
6943 }
6944 
6945 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6946                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6947   if (elem_bt == T_LONG) {
6948     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6949   } else {
6950     assert(elem_bt == T_INT, "");
6951     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6952   }
6953 }
6954 
6955 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6956                                                          XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6957                                                          KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6958   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6959   // Addition/subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6960   // Overflow detection is based on Hacker's Delight section 2-13.
6961   if (ideal_opc == Op_SaturatingAddV) {
6962     // res = src1 + src2
6963     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6964     // Overflow occurs when both inputs have the same sign and the result's sign differs from it.
6965     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
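         // Illustrative example (hypothetical T_INT lane values): src1 = src2 = 0x7FFFFFFF
         //   => res = 0xFFFFFFFE and (res ^ src1) & (res ^ src2) = 0x80000001, whose set
         //   sign bit flags the overflow.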
6966     vpxor(xtmp1, dst, src1, vlen_enc);
6967     vpxor(xtmp2, dst, src2, vlen_enc);
6968     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6969   } else {
6970     assert(ideal_opc == Op_SaturatingSubV, "");
6971     // res = src1 - src2
6972     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6973     // Overflow occurs when the inputs have opposite signs and the
6974     // result's sign differs from the first input's sign.
6975     // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1;
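         // Illustrative example (hypothetical T_INT lane values): src1 = 0x80000000, src2 = 1
         //   => res = 0x7FFFFFFF and (src1 ^ src2) & (res ^ src1) = 0x80000001, whose set
         //   sign bit flags the overflow.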
6976     vpxor(xtmp1, src1, src2, vlen_enc);
6977     vpxor(xtmp2, dst, src1, vlen_enc);
6978     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6979   }
6980 
6981   // Compute overflow detection mask.
6982   evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
6983   // Note: xtmp1 holds -1 in all its lanes after the above call.
6984 
6985   // Compute mask based on first input polarity.
6986   evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
6987 
6988   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6989   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6990 
6991   // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
6992   // set bits in the first input polarity mask hold the min value.
6993   evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6994   // Blend destination lanes with saturated values using overflow detection mask.
6995   evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6996 }
6997 
6998 
6999 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
7000                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
7001                                                         XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
7002   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
7003   // Addition/subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
7004   // Overflow detection is based on Hacker's Delight section 2-13.
7005   if (ideal_opc == Op_SaturatingAddV) {
7006     // res = src1 + src2
7007     vpadd(elem_bt, dst, src1, src2, vlen_enc);
7008     // Overflow occurs when both inputs have the same sign and the result's sign differs from it.
7009     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
7010     vpxor(xtmp1, dst, src1, vlen_enc);
7011     vpxor(xtmp2, dst, src2, vlen_enc);
7012     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
7013   } else {
7014     assert(ideal_opc == Op_SaturatingSubV, "");
7015     // res = src1 - src2
7016     vpsub(elem_bt, dst, src1, src2, vlen_enc);
7017     // Overflow occurs when the inputs have opposite signs and the
7018     // result's sign differs from the first input's sign.
7019     // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1;
7020     vpxor(xtmp1, src1, src2, vlen_enc);
7021     vpxor(xtmp2, dst, src1, vlen_enc);
7022     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
7023   }
7024 
7025   // Sign-extend to compute overflow detection mask.
7026   vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
7027 
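       // xtmp1 = all ones, consumed by the MAX/MIN value generators below.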
7028   vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
7029   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
7030   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
7031 
7032   // Compose saturating min/max vector using first input polarity mask.
7033   vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
7034   vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
7035 
7036   // Blend result with saturating vector using overflow detection mask.
7037   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
7038 }
7039 
7040 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7041   switch(elem_bt) {
7042     case T_BYTE:
7043       if (ideal_opc == Op_SaturatingAddV) {
7044         vpaddsb(dst, src1, src2, vlen_enc);
7045       } else {
7046         assert(ideal_opc == Op_SaturatingSubV, "");
7047         vpsubsb(dst, src1, src2, vlen_enc);
7048       }
7049       break;
7050     case T_SHORT:
7051       if (ideal_opc == Op_SaturatingAddV) {
7052         vpaddsw(dst, src1, src2, vlen_enc);
7053       } else {
7054         assert(ideal_opc == Op_SaturatingSubV, "");
7055         vpsubsw(dst, src1, src2, vlen_enc);
7056       }
7057       break;
7058     default:
7059       fatal("Unsupported type %s", type2name(elem_bt));
7060       break;
7061   }
7062 }
7063 
7064 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7065   switch(elem_bt) {
7066     case T_BYTE:
7067       if (ideal_opc == Op_SaturatingAddV) {
7068         vpaddusb(dst, src1, src2, vlen_enc);
7069       } else {
7070         assert(ideal_opc == Op_SaturatingSubV, "");
7071         vpsubusb(dst, src1, src2, vlen_enc);
7072       }
7073       break;
7074     case T_SHORT:
7075       if (ideal_opc == Op_SaturatingAddV) {
7076         vpaddusw(dst, src1, src2, vlen_enc);
7077       } else {
7078         assert(ideal_opc == Op_SaturatingSubV, "");
7079         vpsubusw(dst, src1, src2, vlen_enc);
7080       }
7081       break;
7082     default:
7083       fatal("Unsupported type %s", type2name(elem_bt));
7084       break;
7085   }
7086 }
7087 
7088 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
7089                                                      XMMRegister src2, int vlen_enc) {
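       // dst initially holds the permutation indices; evpermi2* overwrites it with elements
       // selected from the two table operands (src1, src2).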
7090   switch(elem_bt) {
7091     case T_BYTE:
7092       evpermi2b(dst, src1, src2, vlen_enc);
7093       break;
7094     case T_SHORT:
7095       evpermi2w(dst, src1, src2, vlen_enc);
7096       break;
7097     case T_INT:
7098       evpermi2d(dst, src1, src2, vlen_enc);
7099       break;
7100     case T_LONG:
7101       evpermi2q(dst, src1, src2, vlen_enc);
7102       break;
7103     case T_FLOAT:
7104       evpermi2ps(dst, src1, src2, vlen_enc);
7105       break;
7106     case T_DOUBLE:
7107       evpermi2pd(dst, src1, src2, vlen_enc);
7108       break;
7109     default:
7110       fatal("Unsupported type %s", type2name(elem_bt));
7111       break;
7112   }
7113 }
7114 
7115 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
7116   if (is_unsigned) {
7117     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7118   } else {
7119     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7120   }
7121 }
7122 
7123 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
7124   if (is_unsigned) {
7125     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7126   } else {
7127     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7128   }
7129 }
7130 
7131 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
7132   switch(opcode) {
7133     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7134     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7135     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7136     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7137     default: assert(false, "%s", NodeClassNames[opcode]); break;
7138   }
7139 }
7140 
7141 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7142   switch(opcode) {
7143     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7144     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7145     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7146     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7147     default: assert(false, "%s", NodeClassNames[opcode]); break;
7148   }
7149 }
7150 
7151 void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7152                                             KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
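       // Scalar FP16 max/min reuses the vector implementation with a 128-bit encoding; only lane 0 is significant.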
7153   vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
7154 }
7155 
7156 void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7157                                             KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
7158   if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
7159     // Move sign bits of src2 to mask register.
7160     evpmovw2m(ktmp, src2, vlen_enc);
7161     // xtmp1 = src2 < 0 ? src2 : src1
7162     evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7163     // xtmp2 = src2 < 0 ? src1 : src2
7164     evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
7165     // The idea behind the above swapping is to make the second source operand a positive value.
7166     // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
7167     // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
7168     // the second source operand, either a NaN or a valid floating-point value, is returned.
7169     // dst = max(xtmp1, xtmp2)
7170     evmaxph(dst, xtmp1, xtmp2, vlen_enc);
7171     // isNaN = is_unordered_quiet(xtmp1)
7172     evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
7173     // The final result is the same as the first source if it is a NaN;
7174     // if the second operand holds a NaN then, per the above semantics,
7175     // the result is the same as the second operand.
7176     Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7177   } else {
7178     assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
7179     // Move sign bits of src1 to mask register.
7180     evpmovw2m(ktmp, src1, vlen_enc);
7181     // xtmp1 = src1 < 0 ? src2 : src1
7182     evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7183     // xtmp2 = src1 < 0 ? src1 : src2
7184     evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
7185     // The idea behind the above swapping is to make the second source operand a negative value.
7186     // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
7187     // the second source operand is returned.
7188     // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
7189     // or a valid floating-point value, is written to the result.
7190     // dst = min(xtmp1, xtmp2)
7191     evminph(dst, xtmp1, xtmp2, vlen_enc);
7192     // isNaN = is_unordered_quiet(xtmp1)
7193     evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
7194     // The final result is the same as the first source if it is a NaN;
7195     // if the second operand holds a NaN then, per the above semantics,
7196     // the result is the same as the second operand.
7197     Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7198   }
7199 }