1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 #include "utilities/checkedCast.hpp"
  39 
  40 #ifdef PRODUCT
  41 #define BLOCK_COMMENT(str) /* nothing */
  42 #define STOP(error) stop(error)
  43 #else
  44 #define BLOCK_COMMENT(str) block_comment(str)
  45 #define STOP(error) block_comment(error); stop(error)
  46 #endif
  47 
  48 // C2 compiled method's prolog code.
  49 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  50 
  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes;
  // the frame allocation can be either 3 or 6 bytes. So if we don't do a
  // stack bang then we must use the 6-byte frame allocation even if
  // we have no frame. :-(
  57   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  58 
  59   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  60   // Remove word for return addr
  61   framesize -= wordSize;
  62   stack_bang_size -= wordSize;
  63 
  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  The stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  69   if (stack_bang_size > 0) {
  70     generate_stack_overflow_check(stack_bang_size);
  71 
    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
  74     push(rbp);
  75     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  76     if (PreserveFramePointer) {
  77       mov(rbp, rsp);
  78     }
  79     // Remove word for ebp
  80     framesize -= wordSize;
  81 
  82     // Create frame
  83     if (framesize) {
  84       subptr(rsp, framesize);
  85     }
  86   } else {
  87     // Create frame (force generation of a 4 byte immediate value)
  88     subptr_imm32(rsp, framesize);
  89 
  90     // Save RBP register now.
  91     framesize -= wordSize;
  92     movptr(Address(rsp, framesize), rbp);
  93     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  94     if (PreserveFramePointer) {
  95       movptr(rbp, rsp);
  96       if (framesize > 0) {
  97         addptr(rbp, framesize);
  98       }
  99     }
 100   }
 101 
 102   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 103     framesize -= wordSize;
 104     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 105   }
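  // Rough sketch of the frame laid out above (both paths end up with the same
  // shape; addresses grow downwards):
  //   [ return address           ]  <- pushed by the caller
  //   [ saved rbp                ]
  //   [ 0xbadb100d cookie        ]  <- only if VerifyStackAtCalls
  //   [ ... rest of the frame ... ]
  //   <- rsp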
 106 
 107 #ifndef _LP64
 108   // If method sets FPU control word do it now
 109   if (fp_mode_24b) {
 110     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 111   }
 112   if (UseSSE >= 2 && VerifyFPU) {
 113     verify_FPU(0, "FPU stack must be clean on entry");
 114   }
 115 #endif
 116 
 117 #ifdef ASSERT
 118   if (VerifyStackAtCalls) {
 119     Label L;
 120     push(rax);
 121     mov(rax, rsp);
 122     andptr(rax, StackAlignmentInBytes-1);
 123     cmpptr(rax, StackAlignmentInBytes-wordSize);
 124     pop(rax);
 125     jcc(Assembler::equal, L);
 126     STOP("Stack is not properly aligned!");
 127     bind(L);
 128   }
 129 #endif
 130 
 131   if (!is_stub) {
 132     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
 134     if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
 135       // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 136       Label dummy_slow_path;
 137       Label dummy_continuation;
 138       Label* slow_path = &dummy_slow_path;
 139       Label* continuation = &dummy_continuation;
 140       if (!Compile::current()->output()->in_scratch_emit_size()) {
 141         // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 142         C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 143         Compile::current()->output()->add_stub(stub);
 144         slow_path = &stub->entry();
 145         continuation = &stub->continuation();
 146       }
 147       bs->nmethod_entry_barrier(this, slow_path, continuation);
 148     }
 149 #else
 150     // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 151     bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
 152 #endif
 153   }
 154 }
 155 
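// Map a vector length in bytes to the corresponding AVX vector-length encoding.
// Sub-16-byte lengths (4 and 8 bytes) still use the 128-bit encoding.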
 156 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 157   switch (vlen_in_bytes) {
 158     case  4: // fall-through
 159     case  8: // fall-through
 160     case 16: return Assembler::AVX_128bit;
 161     case 32: return Assembler::AVX_256bit;
 162     case 64: return Assembler::AVX_512bit;
 163 
 164     default: {
 165       ShouldNotReachHere();
 166       return Assembler::AVX_NoVec;
 167     }
 168   }
 169 }
 170 
 171 #if INCLUDE_RTM_OPT
 172 
 173 // Update rtm_counters based on abort status
 174 // input: abort_status
 175 //        rtm_counters (RTMLockingCounters*)
 176 // flags are killed
 177 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
 178 
 179   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
 180   if (PrintPreciseRTMLockingStatistics) {
 181     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
 182       Label check_abort;
 183       testl(abort_status, (1<<i));
 184       jccb(Assembler::equal, check_abort);
 185       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
 186       bind(check_abort);
 187     }
 188   }
 189 }
 190 
 191 // Branch if (random & (count-1) != 0), count is 2^n
 192 // tmp, scr and flags are killed
 193 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
 194   assert(tmp == rax, "");
 195   assert(scr == rdx, "");
 196   rdtsc(); // modifies EDX:EAX
 197   andptr(tmp, count-1);
 198   jccb(Assembler::notZero, brLabel);
 199 }
 200 
 201 // Perform abort ratio calculation, set no_rtm bit if high ratio
 202 // input:  rtm_counters_Reg (RTMLockingCounters* address)
 203 // tmpReg, rtm_counters_Reg and flags are killed
 204 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
 205                                                     Register rtm_counters_Reg,
 206                                                     RTMLockingCounters* rtm_counters,
 207                                                     Metadata* method_data) {
 208   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 209 
 210   if (RTMLockingCalculationDelay > 0) {
 211     // Delay calculation
 212     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
 213     testptr(tmpReg, tmpReg);
 214     jccb(Assembler::equal, L_done);
 215   }
 216   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 217   //   Aborted transactions = abort_count * 100
 218   //   All transactions = total_count *  RTMTotalCountIncrRate
 219   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
 220 
 221   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 222   cmpptr(tmpReg, RTMAbortThreshold);
 223   jccb(Assembler::below, L_check_always_rtm2);
 224   imulptr(tmpReg, tmpReg, 100);
 225 
 226   Register scrReg = rtm_counters_Reg;
 227   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 228   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 229   imulptr(scrReg, scrReg, RTMAbortRatio);
 230   cmpptr(tmpReg, scrReg);
 231   jccb(Assembler::below, L_check_always_rtm1);
 232   if (method_data != nullptr) {
 233     // set rtm_state to "no rtm" in MDO
 234     mov_metadata(tmpReg, method_data);
 235     lock();
 236     orl(Address(tmpReg, MethodData::rtm_state_offset()), NoRTM);
 237   }
 238   jmpb(L_done);
 239   bind(L_check_always_rtm1);
 240   // Reload RTMLockingCounters* address
 241   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 242   bind(L_check_always_rtm2);
 243   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 244   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 245   jccb(Assembler::below, L_done);
 246   if (method_data != nullptr) {
 247     // set rtm_state to "always rtm" in MDO
 248     mov_metadata(tmpReg, method_data);
 249     lock();
 250     orl(Address(tmpReg, MethodData::rtm_state_offset()), UseRTM);
 251   }
 252   bind(L_done);
 253 }
 254 
 255 // Update counters and perform abort ratio calculation
 256 // input:  abort_status_Reg
 257 // rtm_counters_Reg, flags are killed
 258 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 259                                       Register rtm_counters_Reg,
 260                                       RTMLockingCounters* rtm_counters,
 261                                       Metadata* method_data,
 262                                       bool profile_rtm) {
 263 
 264   assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 265   // update rtm counters based on rax value at abort
 266   // reads abort_status_Reg, updates flags
 267   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 268   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 269   if (profile_rtm) {
 270     // Save abort status because abort_status_Reg is used by following code.
 271     if (RTMRetryCount > 0) {
 272       push(abort_status_Reg);
 273     }
 274     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 275     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 276     // restore abort status
 277     if (RTMRetryCount > 0) {
 278       pop(abort_status_Reg);
 279     }
 280   }
 281 }
 282 
 283 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 284 // inputs: retry_count_Reg
 285 //       : abort_status_Reg
 286 // output: retry_count_Reg decremented by 1
 287 // flags are killed
 288 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 289   Label doneRetry;
 290   assert(abort_status_Reg == rax, "");
 291   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 292   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 293   // if reason is in 0x6 and retry count != 0 then retry
 294   andptr(abort_status_Reg, 0x6);
 295   jccb(Assembler::zero, doneRetry);
 296   testl(retry_count_Reg, retry_count_Reg);
 297   jccb(Assembler::zero, doneRetry);
 298   pause();
 299   decrementl(retry_count_Reg);
 300   jmp(retryLabel);
 301   bind(doneRetry);
 302 }
 303 
 304 // Spin and retry if lock is busy,
 305 // inputs: box_Reg (monitor address)
 306 //       : retry_count_Reg
 307 // output: retry_count_Reg decremented by 1
 308 //       : clear z flag if retry count exceeded
 309 // tmp_Reg, scr_Reg, flags are killed
 310 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 311                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 312   Label SpinLoop, SpinExit, doneRetry;
 313   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 314 
 315   testl(retry_count_Reg, retry_count_Reg);
 316   jccb(Assembler::zero, doneRetry);
 317   decrementl(retry_count_Reg);
 318   movptr(scr_Reg, RTMSpinLoopCount);
 319 
 320   bind(SpinLoop);
 321   pause();
 322   decrementl(scr_Reg);
 323   jccb(Assembler::lessEqual, SpinExit);
 324   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 325   testptr(tmp_Reg, tmp_Reg);
 326   jccb(Assembler::notZero, SpinLoop);
 327 
 328   bind(SpinExit);
 329   jmp(retryLabel);
 330   bind(doneRetry);
 331   incrementl(retry_count_Reg); // clear z flag
 332 }
 333 
 334 // Use RTM for normal stack locks
 335 // Input: objReg (object to lock)
 336 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 337                                          Register retry_on_abort_count_Reg,
 338                                          RTMLockingCounters* stack_rtm_counters,
 339                                          Metadata* method_data, bool profile_rtm,
 340                                          Label& DONE_LABEL, Label& IsInflated) {
 341   assert(UseRTMForStackLocks, "why call this otherwise?");
 342   assert(tmpReg == rax, "");
 343   assert(scrReg == rdx, "");
 344   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 345 
 346   if (RTMRetryCount > 0) {
 347     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 348     bind(L_rtm_retry);
 349   }
 350   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 351   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
 352   jcc(Assembler::notZero, IsInflated);
 353 
 354   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 355     Label L_noincrement;
 356     if (RTMTotalCountIncrRate > 1) {
 357       // tmpReg, scrReg and flags are killed
 358       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 359     }
 360     assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM");
 361     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 362     bind(L_noincrement);
 363   }
 364   xbegin(L_on_abort);
 365   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 366   andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
 367   cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
 368   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 369 
 370   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 371   if (UseRTMXendForLockBusy) {
 372     xend();
 373     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 374     jmp(L_decrement_retry);
 375   }
 376   else {
 377     xabort(0);
 378   }
 379   bind(L_on_abort);
 380   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 381     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 382   }
 383   bind(L_decrement_retry);
 384   if (RTMRetryCount > 0) {
 385     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 386     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 387   }
 388 }
 389 
 390 // Use RTM for inflating locks
 391 // inputs: objReg (object to lock)
 392 //         boxReg (on-stack box address (displaced header location) - KILLED)
 393 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 394 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 395                                             Register scrReg, Register retry_on_busy_count_Reg,
 396                                             Register retry_on_abort_count_Reg,
 397                                             RTMLockingCounters* rtm_counters,
 398                                             Metadata* method_data, bool profile_rtm,
 399                                             Label& DONE_LABEL) {
 400   assert(UseRTMLocking, "why call this otherwise?");
 401   assert(tmpReg == rax, "");
 402   assert(scrReg == rdx, "");
 403   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 404   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 405 
 406   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 407   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 408 
 409   if (RTMRetryCount > 0) {
 410     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 411     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 412     bind(L_rtm_retry);
 413   }
 414   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 415     Label L_noincrement;
 416     if (RTMTotalCountIncrRate > 1) {
 417       // tmpReg, scrReg and flags are killed
 418       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 419     }
 420     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 421     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 422     bind(L_noincrement);
 423   }
 424   xbegin(L_on_abort);
 425   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 426   movptr(tmpReg, Address(tmpReg, owner_offset));
 427   testptr(tmpReg, tmpReg);
 428   jcc(Assembler::zero, DONE_LABEL);
 429   if (UseRTMXendForLockBusy) {
 430     xend();
 431     jmp(L_decrement_retry);
 432   }
 433   else {
 434     xabort(0);
 435   }
 436   bind(L_on_abort);
 437   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 438   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 439     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 440   }
 441   if (RTMRetryCount > 0) {
 442     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 443     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 444   }
 445 
 446   movptr(tmpReg, Address(boxReg, owner_offset)) ;
 447   testptr(tmpReg, tmpReg) ;
 448   jccb(Assembler::notZero, L_decrement_retry) ;
 449 
 450   // Appears unlocked - try to swing _owner from null to non-null.
 451   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 452 #ifdef _LP64
 453   Register threadReg = r15_thread;
 454 #else
 455   get_thread(scrReg);
 456   Register threadReg = scrReg;
 457 #endif
 458   lock();
 459   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 460 
 461   if (RTMRetryCount > 0) {
 462     // success done else retry
 463     jccb(Assembler::equal, DONE_LABEL) ;
 464     bind(L_decrement_retry);
 465     // Spin and retry if lock is busy.
 466     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 467   }
 468   else {
 469     bind(L_decrement_retry);
 470   }
 471 }
 472 
 473 #endif //  INCLUDE_RTM_OPT
 474 
 475 // fast_lock and fast_unlock used by C2
 476 
 477 // Because the transitions from emitted code to the runtime
 478 // monitorenter/exit helper stubs are so slow it's critical that
 479 // we inline both the stack-locking fast path and the inflated fast path.
 480 //
 481 // See also: cmpFastLock and cmpFastUnlock.
 482 //
 483 // What follows is a specialized inline transliteration of the code
 484 // in enter() and exit(). If we're concerned about I$ bloat another
 485 // option would be to emit TrySlowEnter and TrySlowExit methods
 486 // at startup-time.  These methods would accept arguments as
 487 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 488 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 489 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 490 // In practice, however, the # of lock sites is bounded and is usually small.
 491 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
 495 //
 496 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
 497 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 498 // to those specialized methods.  That'd give us a mostly platform-independent
 499 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
 501 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 502 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 503 // (b) explicit barriers or fence operations.
 504 //
 505 // TODO:
 506 //
 507 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 508 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 509 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 510 //    the lock operators would typically be faster than reifying Self.
 511 //
 512 // *  Ideally I'd define the primitives as:
 513 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 514 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 515 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore, the register assignments are overconstrained, possibly resulting in
 518 //    sub-optimal code near the synchronization site.
 519 //
 520 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 521 //    Alternately, use a better sp-proximity test.
 522 //
 523 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 524 //    Either one is sufficient to uniquely identify a thread.
 525 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 526 //
 527 // *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty,
//    avoiding the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
 530 //
 531 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 532 //    But beware of excessive branch density on AMD Opterons.
 533 //
 534 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 535 //    or failure of the fast path.  If the fast path fails then we pass
 536 //    control to the slow path, typically in C.  In fast_lock and
 537 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 538 //    will emit a conditional branch immediately after the node.
 539 //    So we have branches to branches and lots of ICC.ZF games.
 540 //    Instead, it might be better to have C2 pass a "FailureLabel"
 541 //    into fast_lock and fast_unlock.  In the case of success, control
 542 //    will drop through the node.  ICC.ZF is undefined at exit.
 543 //    In the case of failure, the node will branch directly to the
 544 //    FailureLabel
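//
// Illustrative shape of the code C2 ends up emitting around these nodes
// (pseudo-assembly sketch, not literal output):
//
//   fast_lock(obj, box, rax, ...)   // sets ICC.ZF
//   jne   slow_path_call            // ZF == 0 -> failure, take the slow path
//   ...                             // ZF == 1 -> lock acquired, fall through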
 545 
 546 
 547 // obj: object to lock
 548 // box: on-stack box address (displaced header location) - KILLED
 549 // rax,: tmp -- KILLED
 550 // scr: tmp -- KILLED
 551 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 552                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 553                                  RTMLockingCounters* rtm_counters,
 554                                  RTMLockingCounters* stack_rtm_counters,
 555                                  Metadata* method_data,
 556                                  bool use_rtm, bool profile_rtm) {
 557   // Ensure the register assignments are disjoint
 558   assert(tmpReg == rax, "");
 559 
 560   if (use_rtm) {
 561     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 562   } else {
 563     assert(cx1Reg == noreg, "");
 564     assert(cx2Reg == noreg, "");
 565     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 566   }
 567 
 568   // Possible cases that we'll encounter in fast_lock
 569   // ------------------------------------------------
 570   // * Inflated
 571   //    -- unlocked
 572   //    -- Locked
 573   //       = by self
 574   //       = by other
 575   // * neutral
 576   // * stack-locked
 577   //    -- by self
 578   //       = sp-proximity test hits
 579   //       = sp-proximity test generates false-negative
 580   //    -- by other
 581   //
 582 
 583   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 584 
 585   if (DiagnoseSyncOnValueBasedClasses != 0) {
 586     load_klass(tmpReg, objReg, scrReg);
 587     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 588     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 589     jcc(Assembler::notZero, DONE_LABEL);
 590   }
 591 
 592 #if INCLUDE_RTM_OPT
 593   if (UseRTMForStackLocks && use_rtm) {
 594     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 595     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 596                       stack_rtm_counters, method_data, profile_rtm,
 597                       DONE_LABEL, IsInflated);
 598   }
 599 #endif // INCLUDE_RTM_OPT
 600 
 601   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 602   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 603   jcc(Assembler::notZero, IsInflated);
 604 
 605   if (LockingMode == LM_MONITOR) {
 606     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 607     testptr(objReg, objReg);
 608   } else if (LockingMode == LM_LEGACY) {
 609     // Attempt stack-locking ...
 610     orptr (tmpReg, markWord::unlocked_value);
 611     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 612     lock();
 613     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 614     jcc(Assembler::equal, COUNT);           // Success
 615 
 616     // Recursive locking.
 617     // The object is stack-locked: markword contains stack pointer to BasicLock.
 618     // Locked by current thread if difference with current SP is less than one page.
 619     subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (success) if the difference is less than one page.
 621     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
 622     movptr(Address(boxReg, 0), tmpReg);
 623   } else {
 624     assert(LockingMode == LM_LIGHTWEIGHT, "");
 625     lightweight_lock(objReg, tmpReg, thread, scrReg, NO_COUNT);
 626     jmp(COUNT);
 627   }
 628   jmp(DONE_LABEL);
 629 
 630   bind(IsInflated);
 631   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 632 
 633 #if INCLUDE_RTM_OPT
 634   // Use the same RTM locking code in 32- and 64-bit VM.
 635   if (use_rtm) {
 636     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 637                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 638   } else {
 639 #endif // INCLUDE_RTM_OPT
 640 
 641 #ifndef _LP64
 642   // The object is inflated.
 643 
 644   // boxReg refers to the on-stack BasicLock in the current frame.
 645   // We'd like to write:
 646   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
 648   // additional latency as we have another ST in the store buffer that must drain.
 649 
 650   // avoid ST-before-CAS
 651   // register juggle because we need tmpReg for cmpxchgptr below
 652   movptr(scrReg, boxReg);
 653   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 654 
 655   // Optimistic form: consider XORL tmpReg,tmpReg
 656   movptr(tmpReg, NULL_WORD);
 657 
 658   // Appears unlocked - try to swing _owner from null to non-null.
 659   // Ideally, I'd manifest "Self" with get_thread and then attempt
 660   // to CAS the register containing Self into m->Owner.
 661   // But we don't have enough registers, so instead we can either try to CAS
 662   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 663   // we later store "Self" into m->Owner.  Transiently storing a stack address
 664   // (rsp or the address of the box) into  m->owner is harmless.
 665   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 666   lock();
 667   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 668   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 669   // If we weren't able to swing _owner from null to the BasicLock
 670   // then take the slow path.
 671   jccb  (Assembler::notZero, NO_COUNT);
 672   // update _owner from BasicLock to thread
 673   get_thread (scrReg);                    // beware: clobbers ICCs
 674   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 675   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 676 
 677   // If the CAS fails we can either retry or pass control to the slow path.
 678   // We use the latter tactic.
 679   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 680   // If the CAS was successful ...
 681   //   Self has acquired the lock
 682   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 683   // Intentional fall-through into DONE_LABEL ...
 684 #else // _LP64
 685   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 686   movq(scrReg, tmpReg);
 687   xorq(tmpReg, tmpReg);
 688   lock();
 689   cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 690   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 691   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 692   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 693   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 694   jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)
 695 
 696   cmpptr(thread, rax);                // Check if we are already the owner (recursive lock)
 697   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 698   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 699   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 700 #endif // _LP64
 701 #if INCLUDE_RTM_OPT
 702   } // use_rtm()
 703 #endif
 704   bind(DONE_LABEL);
 705 
 706   // ZFlag == 1 count in fast path
 707   // ZFlag == 0 count in slow path
 708   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 709 
 710   bind(COUNT);
 711   // Count monitors in fast path
 712   increment(Address(thread, JavaThread::held_monitor_count_offset()));
 713 
 714   xorl(tmpReg, tmpReg); // Set ZF == 1
 715 
 716   bind(NO_COUNT);
 717 
 718   // At NO_COUNT the icc ZFlag is set as follows ...
 719   // fast_unlock uses the same protocol.
 720   // ZFlag == 1 -> Success
 721   // ZFlag == 0 -> Failure - force control through the slow path
 722 }
 723 
 724 // obj: object to unlock
 725 // box: box address (displaced header location), killed.  Must be EAX.
 726 // tmp: killed, cannot be obj nor box.
 727 //
 728 // Some commentary on balanced locking:
 729 //
 730 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 731 // Methods that don't have provably balanced locking are forced to run in the
 732 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 733 // The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired in the current activation (frame).  Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by
//      the frame, the interpreter throws IMSX.
 740 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 742 // B() doesn't have provably balanced locking so it runs in the interpreter.
 743 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 744 // is still locked by A().
 745 //
 746 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 747 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 748 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 749 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
 752 // In the interest of performance we elide m->Owner==Self check in unlock.
 753 // A perfectly viable alternative is to elide the owner check except when
 754 // Xcheck:jni is enabled.
 755 
 756 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 757   assert(boxReg == rax, "");
 758   assert_different_registers(objReg, boxReg, tmpReg);
 759 
 760   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 761 
 762 #if INCLUDE_RTM_OPT
 763   if (UseRTMForStackLocks && use_rtm) {
 764     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 765     Label L_regular_unlock;
 766     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 767     andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
 768     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
 769     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 770     xend();                                                           // otherwise end...
 771     jmp(DONE_LABEL);                                                  // ... and we're done
 772     bind(L_regular_unlock);
 773   }
 774 #endif
 775 
 776   if (LockingMode == LM_LEGACY) {
 777     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 778     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 779   }
 780   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 781   if (LockingMode != LM_MONITOR) {
 782     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 783     jcc(Assembler::zero, Stacked);
 784   }
 785 
 786   // It's inflated.
 787   if (LockingMode == LM_LIGHTWEIGHT) {
    // If the owner is ANONYMOUS, we need to fix it - in an out-of-line stub.
 789     testb(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t) ObjectMonitor::ANONYMOUS_OWNER);
 790 #ifdef _LP64
 791     if (!Compile::current()->output()->in_scratch_emit_size()) {
 792       C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmpReg, boxReg);
 793       Compile::current()->output()->add_stub(stub);
 794       jcc(Assembler::notEqual, stub->entry());
 795       bind(stub->continuation());
 796     } else
 797 #endif
 798     {
 799       // We can't easily implement this optimization on 32 bit because we don't have a thread register.
 800       // Call the slow-path instead.
 801       jcc(Assembler::notEqual, NO_COUNT);
 802     }
 803   }
 804 
 805 #if INCLUDE_RTM_OPT
 806   if (use_rtm) {
 807     Label L_regular_inflated_unlock;
 808     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 809     movptr(boxReg, Address(tmpReg, owner_offset));
 810     testptr(boxReg, boxReg);
 811     jccb(Assembler::notZero, L_regular_inflated_unlock);
 812     xend();
 813     jmp(DONE_LABEL);
 814     bind(L_regular_inflated_unlock);
 815   }
 816 #endif
 817 
 818   // Despite our balanced locking property we still check that m->_owner == Self
 819   // as java routines or native JNI code called by this thread might
 820   // have released the lock.
 821   // Refer to the comments in synchronizer.cpp for how we might encode extra
 822   // state in _succ so we can avoid fetching EntryList|cxq.
 823   //
 824   // If there's no contention try a 1-0 exit.  That is, exit without
 825   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 826   // we detect and recover from the race that the 1-0 exit admits.
 827   //
 828   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 829   // before it STs null into _owner, releasing the lock.  Updates
 830   // to data protected by the critical section must be visible before
 831   // we drop the lock (and thus before any other thread could acquire
 832   // the lock and observe the fields protected by the lock).
 833   // IA32's memory-model is SPO, so STs are ordered with respect to
 834   // each other and there's no need for an explicit barrier (fence).
 835   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 836 #ifndef _LP64
 837   // Note that we could employ various encoding schemes to reduce
 838   // the number of loads below (currently 4) to just 2 or 3.
 839   // Refer to the comments in synchronizer.cpp.
 840   // In practice the chain of fetches doesn't seem to impact performance, however.
 841   xorptr(boxReg, boxReg);
 842   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 843   jccb  (Assembler::notZero, DONE_LABEL);
 844   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 845   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 846   jccb  (Assembler::notZero, DONE_LABEL);
 847   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 848   jmpb  (DONE_LABEL);
 849 #else // _LP64
 850   // It's inflated
 851   Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;
 852 
 853   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 854   jccb(Assembler::equal, LNotRecursive);
 855 
 856   // Recursive inflated unlock
 857   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 858   jmpb(LSuccess);
 859 
 860   bind(LNotRecursive);
 861   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 862   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 863   jccb  (Assembler::notZero, CheckSucc);
 864   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 865   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 866   jmpb  (DONE_LABEL);
 867 
 868   // Try to avoid passing control into the slow_path ...
 869   bind  (CheckSucc);
 870 
 871   // The following optional optimization can be elided if necessary
 872   // Effectively: if (succ == null) goto slow path
 873   // The code reduces the window for a race, however,
 874   // and thus benefits performance.
 875   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 876   jccb  (Assembler::zero, LGoSlowPath);
 877 
 878   xorptr(boxReg, boxReg);
 879   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 880   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 881 
 882   // Memory barrier/fence
 883   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 884   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 885   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 886   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 887   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 888   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 889   lock(); addl(Address(rsp, 0), 0);
 890 
 891   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 892   jccb  (Assembler::notZero, LSuccess);
 893 
 894   // Rare inopportune interleaving - race.
 895   // The successor vanished in the small window above.
 896   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 897   // We need to ensure progress and succession.
 898   // Try to reacquire the lock.
 899   // If that fails then the new owner is responsible for succession and this
 900   // thread needs to take no further action and can exit via the fast path (success).
 901   // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.
 905 
 906   // box is really RAX -- the following CMPXCHG depends on that binding
 907   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 908   lock();
 909   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 910   // There's no successor so we tried to regrab the lock.
 911   // If that didn't work, then another thread grabbed the
 912   // lock so we're done (and exit was a success).
 913   jccb  (Assembler::notEqual, LSuccess);
 914   // Intentional fall-through into slow path
 915 
 916   bind  (LGoSlowPath);
 917   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 918   jmpb  (DONE_LABEL);
 919 
 920   bind  (LSuccess);
 921   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 922   jmpb  (DONE_LABEL);
 923 
 924 #endif
 925   if (LockingMode != LM_MONITOR) {
 926     bind  (Stacked);
 927     if (LockingMode == LM_LIGHTWEIGHT) {
 928       mov(boxReg, tmpReg);
 929       lightweight_unlock(objReg, boxReg, tmpReg, NO_COUNT);
 930       jmp(COUNT);
 931     } else if (LockingMode == LM_LEGACY) {
 932       movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 933       lock();
 934       cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 935     }
 936     // Intentional fall-thru into DONE_LABEL
 937   }
 938   bind(DONE_LABEL);
 939 
 940   // ZFlag == 1 count in fast path
 941   // ZFlag == 0 count in slow path
 942   jccb(Assembler::notZero, NO_COUNT);
 943 
 944   bind(COUNT);
 945   // Count monitors in fast path
 946 #ifndef _LP64
 947   get_thread(tmpReg);
 948   decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 949 #else // _LP64
 950   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 951 #endif
 952 
 953   xorl(tmpReg, tmpReg); // Set ZF == 1
 954 
 955   bind(NO_COUNT);
 956 }
 957 
 958 //-------------------------------------------------------------------------------------------
 959 // Generic instructions support for use in .ad files C2 code generation
 960 
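// Scalar/vector abs and neg for doubles: abs clears the sign bit by AND-ing with a
// sign mask, neg flips the sign bit by XOR-ing with a sign-flip constant.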
 961 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 962   if (dst != src) {
 963     movdqu(dst, src);
 964   }
 965   if (opcode == Op_AbsVD) {
 966     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 967   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 969     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 970   }
 971 }
 972 
 973 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 974   if (opcode == Op_AbsVD) {
 975     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 976   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 978     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 979   }
 980 }
 981 
 982 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 983   if (dst != src) {
 984     movdqu(dst, src);
 985   }
 986   if (opcode == Op_AbsVF) {
 987     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 988   } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
 990     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 991   }
 992 }
 993 
 994 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 995   if (opcode == Op_AbsVF) {
 996     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 997   } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
 999     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
1000   }
1001 }
1002 
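// Packed signed min/max using the legacy SSE forms. Byte/short/int map directly to
// pmins*/pmaxs*; SSE has no packed 64-bit min/max, so the T_LONG case is synthesized
// with pcmpgtq + blendvpd, which implicitly uses xmm0 as the blend mask.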
1003 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
1004   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1005   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
1006 
1007   if (opcode == Op_MinV) {
1008     if (elem_bt == T_BYTE) {
1009       pminsb(dst, src);
1010     } else if (elem_bt == T_SHORT) {
1011       pminsw(dst, src);
1012     } else if (elem_bt == T_INT) {
1013       pminsd(dst, src);
1014     } else {
1015       assert(elem_bt == T_LONG, "required");
1016       assert(tmp == xmm0, "required");
1017       assert_different_registers(dst, src, tmp);
1018       movdqu(xmm0, dst);
1019       pcmpgtq(xmm0, src);
1020       blendvpd(dst, src);  // xmm0 as mask
1021     }
1022   } else { // opcode == Op_MaxV
1023     if (elem_bt == T_BYTE) {
1024       pmaxsb(dst, src);
1025     } else if (elem_bt == T_SHORT) {
1026       pmaxsw(dst, src);
1027     } else if (elem_bt == T_INT) {
1028       pmaxsd(dst, src);
1029     } else {
1030       assert(elem_bt == T_LONG, "required");
1031       assert(tmp == xmm0, "required");
1032       assert_different_registers(dst, src, tmp);
1033       movdqu(xmm0, src);
1034       pcmpgtq(xmm0, dst);
1035       blendvpd(dst, src);  // xmm0 as mask
1036     }
1037   }
1038 }
1039 
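// AVX forms of packed signed min/max. The T_LONG case uses vpminsq/vpmaxsq when the
// vector is 512 bits wide or AVX-512VL is available, and otherwise falls back to a
// compare-and-blend sequence.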
1040 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1041                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1042                                  int vlen_enc) {
1043   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1044 
1045   if (opcode == Op_MinV) {
1046     if (elem_bt == T_BYTE) {
1047       vpminsb(dst, src1, src2, vlen_enc);
1048     } else if (elem_bt == T_SHORT) {
1049       vpminsw(dst, src1, src2, vlen_enc);
1050     } else if (elem_bt == T_INT) {
1051       vpminsd(dst, src1, src2, vlen_enc);
1052     } else {
1053       assert(elem_bt == T_LONG, "required");
1054       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1055         vpminsq(dst, src1, src2, vlen_enc);
1056       } else {
1057         assert_different_registers(dst, src1, src2);
1058         vpcmpgtq(dst, src1, src2, vlen_enc);
1059         vblendvpd(dst, src1, src2, dst, vlen_enc);
1060       }
1061     }
1062   } else { // opcode == Op_MaxV
1063     if (elem_bt == T_BYTE) {
1064       vpmaxsb(dst, src1, src2, vlen_enc);
1065     } else if (elem_bt == T_SHORT) {
1066       vpmaxsw(dst, src1, src2, vlen_enc);
1067     } else if (elem_bt == T_INT) {
1068       vpmaxsd(dst, src1, src2, vlen_enc);
1069     } else {
1070       assert(elem_bt == T_LONG, "required");
1071       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1072         vpmaxsq(dst, src1, src2, vlen_enc);
1073       } else {
1074         assert_different_registers(dst, src1, src2);
1075         vpcmpgtq(dst, src1, src2, vlen_enc);
1076         vblendvpd(dst, src2, src1, dst, vlen_enc);
1077       }
1078     }
1079   }
1080 }
1081 
1082 // Float/Double min max
1083 
1084 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1085                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1086                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1087                                    int vlen_enc) {
1088   assert(UseAVX > 0, "required");
1089   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1090          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1091   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1092   assert_different_registers(a, tmp, atmp, btmp);
1093   assert_different_registers(b, tmp, atmp, btmp);
1094 
1095   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1096   bool is_double_word = is_double_word_type(elem_bt);
1097 
1098   /* Note on 'non-obvious' assembly sequence:
1099    *
1100    * While there are vminps/vmaxps instructions, there are two important differences between hardware
1101    * and Java on how they handle floats:
1102    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
1104    *
1105    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
1106    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
1107    *                (only useful when signs differ, noop otherwise)
1108    *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   *  The following pseudo code describes the algorithm for max[FD] (the min algorithm is along similar lines):
1111    *   btmp = (b < +0.0) ? a : b
1112    *   atmp = (b < +0.0) ? b : a
1113    *   Tmp  = Max_Float(atmp , btmp)
1114    *   Res  = (atmp == NaN) ? atmp : Tmp
1115    */
1116 
1117   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
1118   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
1119   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
1120   XMMRegister mask;
1121 
1122   if (!is_double_word && is_min) {
1123     mask = a;
1124     vblend = &MacroAssembler::vblendvps;
1125     vmaxmin = &MacroAssembler::vminps;
1126     vcmp = &MacroAssembler::vcmpps;
1127   } else if (!is_double_word && !is_min) {
1128     mask = b;
1129     vblend = &MacroAssembler::vblendvps;
1130     vmaxmin = &MacroAssembler::vmaxps;
1131     vcmp = &MacroAssembler::vcmpps;
1132   } else if (is_double_word && is_min) {
1133     mask = a;
1134     vblend = &MacroAssembler::vblendvpd;
1135     vmaxmin = &MacroAssembler::vminpd;
1136     vcmp = &MacroAssembler::vcmppd;
1137   } else {
1138     assert(is_double_word && !is_min, "sanity");
1139     mask = b;
1140     vblend = &MacroAssembler::vblendvpd;
1141     vmaxmin = &MacroAssembler::vmaxpd;
1142     vcmp = &MacroAssembler::vcmppd;
1143   }
1144 
1145   // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
1146   XMMRegister maxmin, scratch;
1147   if (dst == btmp) {
1148     maxmin = btmp;
1149     scratch = tmp;
1150   } else {
1151     maxmin = tmp;
1152     scratch = btmp;
1153   }
1154 
1155   bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
1156   if (precompute_mask && !is_double_word) {
1157     vpsrad(tmp, mask, 32, vlen_enc);
1158     mask = tmp;
1159   } else if (precompute_mask && is_double_word) {
1160     vpxor(tmp, tmp, tmp, vlen_enc);
1161     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1162     mask = tmp;
1163   }
1164 
1165   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1166   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1167   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1168   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1169   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1170 }
1171 
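// AVX-512 variant of the float/double min/max above, using an opmask register:
// the sign of one input (evpmovd2m/evpmovq2m) selects the operand ordering for the
// blends, vmin*/vmax* does the arithmetic, and an unordered compare merges any NaN
// lanes back into the result.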
1172 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1173                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1174                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1175                                     int vlen_enc) {
1176   assert(UseAVX > 2, "required");
1177   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1178          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1179   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1180   assert_different_registers(dst, a, atmp, btmp);
1181   assert_different_registers(dst, b, atmp, btmp);
1182 
1183   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1184   bool is_double_word = is_double_word_type(elem_bt);
1185   bool merge = true;
1186 
1187   if (!is_double_word && is_min) {
1188     evpmovd2m(ktmp, a, vlen_enc);
1189     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1190     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1191     vminps(dst, atmp, btmp, vlen_enc);
1192     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1193     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1194   } else if (!is_double_word && !is_min) {
1195     evpmovd2m(ktmp, b, vlen_enc);
1196     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1197     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1198     vmaxps(dst, atmp, btmp, vlen_enc);
1199     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1200     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1201   } else if (is_double_word && is_min) {
1202     evpmovq2m(ktmp, a, vlen_enc);
1203     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1204     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1205     vminpd(dst, atmp, btmp, vlen_enc);
1206     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1207     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1208   } else {
1209     assert(is_double_word && !is_min, "sanity");
1210     evpmovq2m(ktmp, b, vlen_enc);
1211     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1212     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1213     vmaxpd(dst, atmp, btmp, vlen_enc);
1214     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1215     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1216   }
1217 }
1218 
1219 // Float/Double signum
1220 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1221   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1222 
1223   Label DONE_LABEL;
1224 
1225   if (opcode == Op_SignumF) {
1226     assert(UseSSE > 0, "required");
1227     ucomiss(dst, zero);
1228     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1229     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1230     movflt(dst, one);
1231     jcc(Assembler::above, DONE_LABEL);
1232     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1233   } else if (opcode == Op_SignumD) {
1234     assert(UseSSE > 1, "required");
1235     ucomisd(dst, zero);
1236     jcc(Assembler::equal, DONE_LABEL);    // special case +0.0/-0.0: return the argument unchanged
1237     jcc(Assembler::parity, DONE_LABEL);   // special case NaN: return the argument (NaN) unchanged
1238     movdbl(dst, one);
1239     jcc(Assembler::above, DONE_LABEL);
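         // Argument is negative: flip the sign bit of 1.0 to produce -1.0.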
1240     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1241   }
1242 
1243   bind(DONE_LABEL);
1244 }
1245 
1246 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1247   if (sign) {
1248     pmovsxbw(dst, src);
1249   } else {
1250     pmovzxbw(dst, src);
1251   }
1252 }
1253 
1254 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1255   if (sign) {
1256     vpmovsxbw(dst, src, vector_len);
1257   } else {
1258     vpmovzxbw(dst, src, vector_len);
1259   }
1260 }
1261 
1262 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1263   if (sign) {
1264     vpmovsxbd(dst, src, vector_len);
1265   } else {
1266     vpmovzxbd(dst, src, vector_len);
1267   }
1268 }
1269 
1270 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1271   if (sign) {
1272     vpmovsxwd(dst, src, vector_len);
1273   } else {
1274     vpmovzxwd(dst, src, vector_len);
1275   }
1276 }
1277 
1278 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1279                                      int shift, int vector_len) {
1280   if (opcode == Op_RotateLeftV) {
1281     if (etype == T_INT) {
1282       evprold(dst, src, shift, vector_len);
1283     } else {
1284       assert(etype == T_LONG, "expected type T_LONG");
1285       evprolq(dst, src, shift, vector_len);
1286     }
1287   } else {
1288     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1289     if (etype == T_INT) {
1290       evprord(dst, src, shift, vector_len);
1291     } else {
1292       assert(etype == T_LONG, "expected type T_LONG");
1293       evprorq(dst, src, shift, vector_len);
1294     }
1295   }
1296 }
1297 
1298 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1299                                      XMMRegister shift, int vector_len) {
1300   if (opcode == Op_RotateLeftV) {
1301     if (etype == T_INT) {
1302       evprolvd(dst, src, shift, vector_len);
1303     } else {
1304       assert(etype == T_LONG, "expected type T_LONG");
1305       evprolvq(dst, src, shift, vector_len);
1306     }
1307   } else {
1308     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1309     if (etype == T_INT) {
1310       evprorvd(dst, src, shift, vector_len);
1311     } else {
1312       assert(etype == T_LONG, "expected type T_LONG");
1313       evprorvq(dst, src, shift, vector_len);
1314     }
1315   }
1316 }
1317 
1318 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1319   if (opcode == Op_RShiftVI) {
1320     psrad(dst, shift);
1321   } else if (opcode == Op_LShiftVI) {
1322     pslld(dst, shift);
1323   } else {
1324     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1325     psrld(dst, shift);
1326   }
1327 }
1328 
1329 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1330   switch (opcode) {
1331     case Op_RShiftVI:  psrad(dst, shift); break;
1332     case Op_LShiftVI:  pslld(dst, shift); break;
1333     case Op_URShiftVI: psrld(dst, shift); break;
1334 
1335     default: assert(false, "%s", NodeClassNames[opcode]);
1336   }
1337 }
1338 
1339 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1340   if (opcode == Op_RShiftVI) {
1341     vpsrad(dst, nds, shift, vector_len);
1342   } else if (opcode == Op_LShiftVI) {
1343     vpslld(dst, nds, shift, vector_len);
1344   } else {
1345     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1346     vpsrld(dst, nds, shift, vector_len);
1347   }
1348 }
1349 
1350 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1351   switch (opcode) {
1352     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1353     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1354     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1355 
1356     default: assert(false, "%s", NodeClassNames[opcode]);
1357   }
1358 }
1359 
1360 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1361   switch (opcode) {
1362     case Op_RShiftVB:  // fall-through
1363     case Op_RShiftVS:  psraw(dst, shift); break;
1364 
1365     case Op_LShiftVB:  // fall-through
1366     case Op_LShiftVS:  psllw(dst, shift);   break;
1367 
1368     case Op_URShiftVS: // fall-through
1369     case Op_URShiftVB: psrlw(dst, shift);  break;
1370 
1371     default: assert(false, "%s", NodeClassNames[opcode]);
1372   }
1373 }
1374 
1375 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1376   switch (opcode) {
1377     case Op_RShiftVB:  // fall-through
1378     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1379 
1380     case Op_LShiftVB:  // fall-through
1381     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1382 
1383     case Op_URShiftVS: // fall-through
1384     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1385 
1386     default: assert(false, "%s", NodeClassNames[opcode]);
1387   }
1388 }
1389 
1390 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1391   switch (opcode) {
1392     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1393     case Op_LShiftVL:  psllq(dst, shift); break;
1394     case Op_URShiftVL: psrlq(dst, shift); break;
1395 
1396     default: assert(false, "%s", NodeClassNames[opcode]);
1397   }
1398 }
1399 
1400 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1401   if (opcode == Op_RShiftVL) {
1402     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1403   } else if (opcode == Op_LShiftVL) {
1404     psllq(dst, shift);
1405   } else {
1406     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1407     psrlq(dst, shift);
1408   }
1409 }
1410 
1411 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1412   switch (opcode) {
1413     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1414     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1415     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1416 
1417     default: assert(false, "%s", NodeClassNames[opcode]);
1418   }
1419 }
1420 
1421 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1422   if (opcode == Op_RShiftVL) {
1423     evpsraq(dst, nds, shift, vector_len);
1424   } else if (opcode == Op_LShiftVL) {
1425     vpsllq(dst, nds, shift, vector_len);
1426   } else {
1427     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1428     vpsrlq(dst, nds, shift, vector_len);
1429   }
1430 }
1431 
1432 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1433   switch (opcode) {
1434     case Op_RShiftVB:  // fall-through
1435     case Op_RShiftVS:  // fall-through
1436     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1437 
1438     case Op_LShiftVB:  // fall-through
1439     case Op_LShiftVS:  // fall-through
1440     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1441 
1442     case Op_URShiftVB: // fall-through
1443     case Op_URShiftVS: // fall-through
1444     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1445 
1446     default: assert(false, "%s", NodeClassNames[opcode]);
1447   }
1448 }
1449 
1450 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1451   switch (opcode) {
1452     case Op_RShiftVB:  // fall-through
1453     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1454 
1455     case Op_LShiftVB:  // fall-through
1456     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1457 
1458     case Op_URShiftVB: // fall-through
1459     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1460 
1461     default: assert(false, "%s", NodeClassNames[opcode]);
1462   }
1463 }
1464 
1465 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1466   assert(UseAVX >= 2, "required");
1467   switch (opcode) {
1468     case Op_RShiftVL: {
1469       if (UseAVX > 2) {
1470         assert(tmp == xnoreg, "not used");
1471         if (!VM_Version::supports_avx512vl()) {
1472           vlen_enc = Assembler::AVX_512bit;
1473         }
1474         evpsravq(dst, src, shift, vlen_enc);
1475       } else {
1476         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1477         vpsrlvq(dst, src, shift, vlen_enc);
1478         vpsrlvq(tmp, tmp, shift, vlen_enc);
1479         vpxor(dst, dst, tmp, vlen_enc);
1480         vpsubq(dst, dst, tmp, vlen_enc);
1481       }
1482       break;
1483     }
1484     case Op_LShiftVL: {
1485       assert(tmp == xnoreg, "not used");
1486       vpsllvq(dst, src, shift, vlen_enc);
1487       break;
1488     }
1489     case Op_URShiftVL: {
1490       assert(tmp == xnoreg, "not used");
1491       vpsrlvq(dst, src, shift, vlen_enc);
1492       break;
1493     }
1494     default: assert(false, "%s", NodeClassNames[opcode]);
1495   }
1496 }
1497 
1498 // Variable shift of src by shift, using vtmp as a TEMP, giving a word result in dst
1499 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1500   assert(opcode == Op_LShiftVB ||
1501          opcode == Op_RShiftVB ||
1502          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1503   bool sign = (opcode != Op_URShiftVB);
1504   assert(vector_len == 0, "required");
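       // Widen bytes to ints, do the variable shift on ints, mask back to the byte
       // range, then narrow the result to words.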
1505   vextendbd(sign, dst, src, 1);
1506   vpmovzxbd(vtmp, shift, 1);
1507   varshiftd(opcode, dst, dst, vtmp, 1);
1508   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1509   vextracti128_high(vtmp, dst);
1510   vpackusdw(dst, dst, vtmp, 0);
1511 }
1512 
1513 // Variable shift of src by shift, using vtmp as a TEMP, giving a byte result in dst
1514 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1515   assert(opcode == Op_LShiftVB ||
1516          opcode == Op_RShiftVB ||
1517          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1518   bool sign = (opcode != Op_URShiftVB);
1519   int ext_vector_len = vector_len + 1;
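       // Widen bytes to words, do the variable shift on words, mask back to the byte
       // range, then narrow to bytes (for 256/512-bit vectors vpermq fixes the lane
       // order after the in-lane pack).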
1520   vextendbw(sign, dst, src, ext_vector_len);
1521   vpmovzxbw(vtmp, shift, ext_vector_len);
1522   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1523   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1524   if (vector_len == 0) {
1525     vextracti128_high(vtmp, dst);
1526     vpackuswb(dst, dst, vtmp, vector_len);
1527   } else {
1528     vextracti64x4_high(vtmp, dst);
1529     vpackuswb(dst, dst, vtmp, vector_len);
1530     vpermq(dst, dst, 0xD8, vector_len);
1531   }
1532 }
1533 
1534 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1535   switch(typ) {
1536     case T_BYTE:
1537       pinsrb(dst, val, idx);
1538       break;
1539     case T_SHORT:
1540       pinsrw(dst, val, idx);
1541       break;
1542     case T_INT:
1543       pinsrd(dst, val, idx);
1544       break;
1545     case T_LONG:
1546       pinsrq(dst, val, idx);
1547       break;
1548     default:
1549       assert(false,"Should not reach here.");
1550       break;
1551   }
1552 }
1553 
1554 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1555   switch(typ) {
1556     case T_BYTE:
1557       vpinsrb(dst, src, val, idx);
1558       break;
1559     case T_SHORT:
1560       vpinsrw(dst, src, val, idx);
1561       break;
1562     case T_INT:
1563       vpinsrd(dst, src, val, idx);
1564       break;
1565     case T_LONG:
1566       vpinsrq(dst, src, val, idx);
1567       break;
1568     default:
1569       assert(false,"Should not reach here.");
1570       break;
1571   }
1572 }
1573 
1574 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1575   switch(typ) {
1576     case T_INT:
1577       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1578       break;
1579     case T_FLOAT:
1580       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1581       break;
1582     case T_LONG:
1583       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1584       break;
1585     case T_DOUBLE:
1586       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1587       break;
1588     default:
1589       assert(false,"Should not reach here.");
1590       break;
1591   }
1592 }
1593 
1594 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1595   switch(typ) {
1596     case T_INT:
1597       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1598       break;
1599     case T_FLOAT:
1600       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1601       break;
1602     case T_LONG:
1603       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1604       break;
1605     case T_DOUBLE:
1606       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1607       break;
1608     default:
1609       assert(false,"Should not reach here.");
1610       break;
1611   }
1612 }
1613 
1614 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1615   switch(typ) {
1616     case T_INT:
1617       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1618       break;
1619     case T_FLOAT:
1620       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1621       break;
1622     case T_LONG:
1623       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1624       break;
1625     case T_DOUBLE:
1626       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1627       break;
1628     default:
1629       assert(false,"Should not reach here.");
1630       break;
1631   }
1632 }
1633 
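     // Expand a vector of byte booleans (assumed to be 0/1 per element) into a
     // full-width element mask (all-zeros/all-ones) by negating the bytes and
     // sign-extending to the element size.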
1634 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1635   if (vlen_in_bytes <= 16) {
1636     pxor (dst, dst);
1637     psubb(dst, src);
1638     switch (elem_bt) {
1639       case T_BYTE:   /* nothing to do */ break;
1640       case T_SHORT:  pmovsxbw(dst, dst); break;
1641       case T_INT:    pmovsxbd(dst, dst); break;
1642       case T_FLOAT:  pmovsxbd(dst, dst); break;
1643       case T_LONG:   pmovsxbq(dst, dst); break;
1644       case T_DOUBLE: pmovsxbq(dst, dst); break;
1645 
1646       default: assert(false, "%s", type2name(elem_bt));
1647     }
1648   } else {
1649     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1650     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1651 
1652     vpxor (dst, dst, dst, vlen_enc);
1653     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1654 
1655     switch (elem_bt) {
1656       case T_BYTE:   /* nothing to do */            break;
1657       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1658       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1659       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1660       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1661       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1662 
1663       default: assert(false, "%s", type2name(elem_bt));
1664     }
1665   }
1666 }
1667 
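     // Same idea, but producing an AVX-512 opmask. On the novlbwdq path the bytes are
     // first widened to ints and compared against a stub constant instead of using
     // evpmovb2m directly.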
1668 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1669   if (novlbwdq) {
1670     vpmovsxbd(xtmp, src, vlen_enc);
1671     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1672             Assembler::eq, true, vlen_enc, noreg);
1673   } else {
1674     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1675     vpsubb(xtmp, xtmp, src, vlen_enc);
1676     evpmovb2m(dst, xtmp, vlen_enc);
1677   }
1678 }
1679 
1680 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1681   switch (vlen_in_bytes) {
1682     case 4:  movdl(dst, src);   break;
1683     case 8:  movq(dst, src);    break;
1684     case 16: movdqu(dst, src);  break;
1685     case 32: vmovdqu(dst, src); break;
1686     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1687     default: ShouldNotReachHere();
1688   }
1689 }
1690 
1691 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1692   assert(rscratch != noreg || always_reachable(src), "missing");
1693 
1694   if (reachable(src)) {
1695     load_vector(dst, as_Address(src), vlen_in_bytes);
1696   } else {
1697     lea(rscratch, src);
1698     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1699   }
1700 }
1701 
1702 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1703   int vlen_enc = vector_length_encoding(vlen);
1704   if (VM_Version::supports_avx()) {
1705     if (bt == T_LONG) {
1706       if (VM_Version::supports_avx2()) {
1707         vpbroadcastq(dst, src, vlen_enc);
1708       } else {
1709         vmovddup(dst, src, vlen_enc);
1710       }
1711     } else if (bt == T_DOUBLE) {
1712       if (vlen_enc != Assembler::AVX_128bit) {
1713         vbroadcastsd(dst, src, vlen_enc, noreg);
1714       } else {
1715         vmovddup(dst, src, vlen_enc);
1716       }
1717     } else {
1718       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1719         vpbroadcastd(dst, src, vlen_enc);
1720       } else {
1721         vbroadcastss(dst, src, vlen_enc);
1722       }
1723     }
1724   } else if (VM_Version::supports_sse3()) {
1725     movddup(dst, src);
1726   } else {
1727     movq(dst, src);
1728     if (vlen == 16) {
1729       punpcklqdq(dst, dst);
1730     }
1731   }
1732 }
1733 
1734 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1735   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1736   int offset = exact_log2(type2aelembytes(bt)) << 6;
1737   if (is_floating_point_type(bt)) {
1738     offset += 128;
1739   }
1740   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1741   load_vector(dst, addr, vlen_in_bytes);
1742 }
1743 
1744 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1745 
1746 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1747   int vector_len = Assembler::AVX_128bit;
1748 
1749   switch (opcode) {
1750     case Op_AndReductionV:  pand(dst, src); break;
1751     case Op_OrReductionV:   por (dst, src); break;
1752     case Op_XorReductionV:  pxor(dst, src); break;
1753     case Op_MinReductionV:
1754       switch (typ) {
1755         case T_BYTE:        pminsb(dst, src); break;
1756         case T_SHORT:       pminsw(dst, src); break;
1757         case T_INT:         pminsd(dst, src); break;
1758         case T_LONG:        assert(UseAVX > 2, "required");
1759                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1760         default:            assert(false, "wrong type");
1761       }
1762       break;
1763     case Op_MaxReductionV:
1764       switch (typ) {
1765         case T_BYTE:        pmaxsb(dst, src); break;
1766         case T_SHORT:       pmaxsw(dst, src); break;
1767         case T_INT:         pmaxsd(dst, src); break;
1768         case T_LONG:        assert(UseAVX > 2, "required");
1769                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1770         default:            assert(false, "wrong type");
1771       }
1772       break;
1773     case Op_AddReductionVF: addss(dst, src); break;
1774     case Op_AddReductionVD: addsd(dst, src); break;
1775     case Op_AddReductionVI:
1776       switch (typ) {
1777         case T_BYTE:        paddb(dst, src); break;
1778         case T_SHORT:       paddw(dst, src); break;
1779         case T_INT:         paddd(dst, src); break;
1780         default:            assert(false, "wrong type");
1781       }
1782       break;
1783     case Op_AddReductionVL: paddq(dst, src); break;
1784     case Op_MulReductionVF: mulss(dst, src); break;
1785     case Op_MulReductionVD: mulsd(dst, src); break;
1786     case Op_MulReductionVI:
1787       switch (typ) {
1788         case T_SHORT:       pmullw(dst, src); break;
1789         case T_INT:         pmulld(dst, src); break;
1790         default:            assert(false, "wrong type");
1791       }
1792       break;
1793     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1794                             evpmullq(dst, dst, src, vector_len); break;
1795     default:                assert(false, "wrong opcode");
1796   }
1797 }
1798 
1799 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1800   int vector_len = Assembler::AVX_256bit;
1801 
1802   switch (opcode) {
1803     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1804     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1805     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1806     case Op_MinReductionV:
1807       switch (typ) {
1808         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1809         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1810         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1811         case T_LONG:        assert(UseAVX > 2, "required");
1812                             vpminsq(dst, src1, src2, vector_len); break;
1813         default:            assert(false, "wrong type");
1814       }
1815       break;
1816     case Op_MaxReductionV:
1817       switch (typ) {
1818         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1819         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1820         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1821         case T_LONG:        assert(UseAVX > 2, "required");
1822                             vpmaxsq(dst, src1, src2, vector_len); break;
1823         default:            assert(false, "wrong type");
1824       }
1825       break;
1826     case Op_AddReductionVI:
1827       switch (typ) {
1828         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1829         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1830         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1831         default:            assert(false, "wrong type");
1832       }
1833       break;
1834     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1835     case Op_MulReductionVI:
1836       switch (typ) {
1837         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1838         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1839         default:            assert(false, "wrong type");
1840       }
1841       break;
1842     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1843     default:                assert(false, "wrong opcode");
1844   }
1845 }
1846 
1847 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1848                                   XMMRegister dst, XMMRegister src,
1849                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1850   switch (opcode) {
1851     case Op_AddReductionVF:
1852     case Op_MulReductionVF:
1853       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1854       break;
1855 
1856     case Op_AddReductionVD:
1857     case Op_MulReductionVD:
1858       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1859       break;
1860 
1861     default: assert(false, "wrong opcode");
1862   }
1863 }
1864 
1865 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1866                              Register dst, Register src1, XMMRegister src2,
1867                              XMMRegister vtmp1, XMMRegister vtmp2) {
1868   switch (vlen) {
1869     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1870     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1871     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1872     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1873 
1874     default: assert(false, "wrong vector length");
1875   }
1876 }
1877 
1878 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1879                              Register dst, Register src1, XMMRegister src2,
1880                              XMMRegister vtmp1, XMMRegister vtmp2) {
1881   switch (vlen) {
1882     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1883     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1884     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1885     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1886 
1887     default: assert(false, "wrong vector length");
1888   }
1889 }
1890 
1891 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1892                              Register dst, Register src1, XMMRegister src2,
1893                              XMMRegister vtmp1, XMMRegister vtmp2) {
1894   switch (vlen) {
1895     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1896     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1897     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1898     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1899 
1900     default: assert(false, "wrong vector length");
1901   }
1902 }
1903 
1904 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1905                              Register dst, Register src1, XMMRegister src2,
1906                              XMMRegister vtmp1, XMMRegister vtmp2) {
1907   switch (vlen) {
1908     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1909     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1910     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1911     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1912 
1913     default: assert(false, "wrong vector length");
1914   }
1915 }
1916 
1917 #ifdef _LP64
1918 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1919                              Register dst, Register src1, XMMRegister src2,
1920                              XMMRegister vtmp1, XMMRegister vtmp2) {
1921   switch (vlen) {
1922     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1923     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1924     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1925 
1926     default: assert(false, "wrong vector length");
1927   }
1928 }
1929 #endif // _LP64
1930 
1931 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1932   switch (vlen) {
1933     case 2:
1934       assert(vtmp2 == xnoreg, "");
1935       reduce2F(opcode, dst, src, vtmp1);
1936       break;
1937     case 4:
1938       assert(vtmp2 == xnoreg, "");
1939       reduce4F(opcode, dst, src, vtmp1);
1940       break;
1941     case 8:
1942       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1943       break;
1944     case 16:
1945       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1946       break;
1947     default: assert(false, "wrong vector length");
1948   }
1949 }
1950 
1951 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1952   switch (vlen) {
1953     case 2:
1954       assert(vtmp2 == xnoreg, "");
1955       reduce2D(opcode, dst, src, vtmp1);
1956       break;
1957     case 4:
1958       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1959       break;
1960     case 8:
1961       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1962       break;
1963     default: assert(false, "wrong vector length");
1964   }
1965 }
1966 
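     // Fold the two int lanes of src2 together (phaddd for add, shuffle + op otherwise),
     // then combine the result with the scalar accumulator passed in src1.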
1967 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1968   if (opcode == Op_AddReductionVI) {
1969     if (vtmp1 != src2) {
1970       movdqu(vtmp1, src2);
1971     }
1972     phaddd(vtmp1, vtmp1);
1973   } else {
1974     pshufd(vtmp1, src2, 0x1);
1975     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1976   }
1977   movdl(vtmp2, src1);
1978   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1979   movdl(dst, vtmp1);
1980 }
1981 
1982 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1983   if (opcode == Op_AddReductionVI) {
1984     if (vtmp1 != src2) {
1985       movdqu(vtmp1, src2);
1986     }
1987     phaddd(vtmp1, src2);
1988     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1989   } else {
1990     pshufd(vtmp2, src2, 0xE);
1991     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1992     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1993   }
1994 }
1995 
1996 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1997   if (opcode == Op_AddReductionVI) {
1998     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1999     vextracti128_high(vtmp2, vtmp1);
2000     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2001     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2002   } else {
2003     vextracti128_high(vtmp1, src2);
2004     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2005     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2006   }
2007 }
2008 
2009 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2010   vextracti64x4_high(vtmp2, src2);
2011   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2012   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2013 }
2014 
2015 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2016   pshufd(vtmp2, src2, 0x1);
2017   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2018   movdqu(vtmp1, vtmp2);
2019   psrldq(vtmp1, 2);
2020   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2021   movdqu(vtmp2, vtmp1);
2022   psrldq(vtmp2, 1);
2023   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2024   movdl(vtmp2, src1);
2025   pmovsxbd(vtmp1, vtmp1);
2026   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2027   pextrb(dst, vtmp1, 0x0);
2028   movsbl(dst, dst);
2029 }
2030 
2031 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2032   pshufd(vtmp1, src2, 0xE);
2033   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2034   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2035 }
2036 
2037 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2038   vextracti128_high(vtmp2, src2);
2039   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2040   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2041 }
2042 
2043 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2044   vextracti64x4_high(vtmp1, src2);
2045   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2046   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2047 }
2048 
2049 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2050   pmovsxbw(vtmp2, src2);
2051   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2052 }
2053 
2054 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2055   if (UseAVX > 1) {
2056     int vector_len = Assembler::AVX_256bit;
2057     vpmovsxbw(vtmp1, src2, vector_len);
2058     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2059   } else {
2060     pmovsxbw(vtmp2, src2);
2061     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2062     pshufd(vtmp2, src2, 0x1);
2063     pmovsxbw(vtmp2, src2);
2064     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2065   }
2066 }
2067 
2068 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2069   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2070     int vector_len = Assembler::AVX_512bit;
2071     vpmovsxbw(vtmp1, src2, vector_len);
2072     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2073   } else {
2074     assert(UseAVX >= 2, "required");
2075     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2076     vextracti128_high(vtmp2, src2);
2077     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2078   }
2079 }
2080 
2081 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2082   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2083   vextracti64x4_high(vtmp2, src2);
2084   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2085 }
2086 
2087 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2088   if (opcode == Op_AddReductionVI) {
2089     if (vtmp1 != src2) {
2090       movdqu(vtmp1, src2);
2091     }
2092     phaddw(vtmp1, vtmp1);
2093     phaddw(vtmp1, vtmp1);
2094   } else {
2095     pshufd(vtmp2, src2, 0x1);
2096     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2097     movdqu(vtmp1, vtmp2);
2098     psrldq(vtmp1, 2);
2099     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2100   }
2101   movdl(vtmp2, src1);
2102   pmovsxwd(vtmp1, vtmp1);
2103   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2104   pextrw(dst, vtmp1, 0x0);
2105   movswl(dst, dst);
2106 }
2107 
2108 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2109   if (opcode == Op_AddReductionVI) {
2110     if (vtmp1 != src2) {
2111       movdqu(vtmp1, src2);
2112     }
2113     phaddw(vtmp1, src2);
2114   } else {
2115     pshufd(vtmp1, src2, 0xE);
2116     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2117   }
2118   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2119 }
2120 
2121 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2122   if (opcode == Op_AddReductionVI) {
2123     int vector_len = Assembler::AVX_256bit;
2124     vphaddw(vtmp2, src2, src2, vector_len);
2125     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2126   } else {
2127     vextracti128_high(vtmp2, src2);
2128     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2129   }
2130   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2131 }
2132 
2133 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2134   int vector_len = Assembler::AVX_256bit;
2135   vextracti64x4_high(vtmp1, src2);
2136   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2137   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2138 }
2139 
2140 #ifdef _LP64
2141 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2142   pshufd(vtmp2, src2, 0xE);
2143   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2144   movdq(vtmp1, src1);
2145   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2146   movdq(dst, vtmp1);
2147 }
2148 
2149 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2150   vextracti128_high(vtmp1, src2);
2151   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2152   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2153 }
2154 
2155 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2156   vextracti64x4_high(vtmp2, src2);
2157   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2158   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2159 }
2160 
2161 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
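       // e.g. len == 5: bzhi clears every bit at position >= 5, leaving 0x1f, so the
       // opmask ends up with its lowest five bits set.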
2162   mov64(temp, -1L);
2163   bzhiq(temp, temp, len);
2164   kmovql(dst, temp);
2165 }
2166 #endif // _LP64
2167 
2168 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2169   reduce_operation_128(T_FLOAT, opcode, dst, src);
2170   pshufd(vtmp, src, 0x1);
2171   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2172 }
2173 
2174 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2175   reduce2F(opcode, dst, src, vtmp);
2176   pshufd(vtmp, src, 0x2);
2177   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2178   pshufd(vtmp, src, 0x3);
2179   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2180 }
2181 
2182 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2183   reduce4F(opcode, dst, src, vtmp2);
2184   vextractf128_high(vtmp2, src);
2185   reduce4F(opcode, dst, vtmp2, vtmp1);
2186 }
2187 
2188 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2189   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2190   vextracti64x4_high(vtmp1, src);
2191   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2192 }
2193 
2194 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2195   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2196   pshufd(vtmp, src, 0xE);
2197   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2198 }
2199 
2200 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2201   reduce2D(opcode, dst, src, vtmp2);
2202   vextractf128_high(vtmp2, src);
2203   reduce2D(opcode, dst, vtmp2, vtmp1);
2204 }
2205 
2206 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2207   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2208   vextracti64x4_high(vtmp1, src);
2209   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2210 }
2211 
2212 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2213   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2214 }
2215 
2216 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2217   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2218 }
2219 
2220 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2221                                  int vec_enc) {
2222   switch(elem_bt) {
2223     case T_INT:
2224     case T_FLOAT:
2225       vmaskmovps(dst, src, mask, vec_enc);
2226       break;
2227     case T_LONG:
2228     case T_DOUBLE:
2229       vmaskmovpd(dst, src, mask, vec_enc);
2230       break;
2231     default:
2232       fatal("Unsupported type %s", type2name(elem_bt));
2233       break;
2234   }
2235 }
2236 
2237 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2238                                  int vec_enc) {
2239   switch(elem_bt) {
2240     case T_INT:
2241     case T_FLOAT:
2242       vmaskmovps(dst, src, mask, vec_enc);
2243       break;
2244     case T_LONG:
2245     case T_DOUBLE:
2246       vmaskmovpd(dst, src, mask, vec_enc);
2247       break;
2248     default:
2249       fatal("Unsupported type %s", type2name(elem_bt));
2250       break;
2251   }
2252 }
2253 
2254 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2255                                           XMMRegister dst, XMMRegister src,
2256                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2257                                           XMMRegister xmm_0, XMMRegister xmm_1) {
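       // vpermilps immediates used below: 1 brings element 1 into position 0; 14
       // (0b1110) brings elements 2 and 3 into positions 0 and 1 (per 128-bit lane).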
2258   const int permconst[] = {1, 14};
2259   XMMRegister wsrc = src;
2260   XMMRegister wdst = xmm_0;
2261   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2262 
2263   int vlen_enc = Assembler::AVX_128bit;
2264   if (vlen == 16) {
2265     vlen_enc = Assembler::AVX_256bit;
2266   }
2267 
2268   for (int i = log2(vlen) - 1; i >=0; i--) {
2269     if (i == 0 && !is_dst_valid) {
2270       wdst = dst;
2271     }
2272     if (i == 3) {
2273       vextracti64x4_high(wtmp, wsrc);
2274     } else if (i == 2) {
2275       vextracti128_high(wtmp, wsrc);
2276     } else { // i = [0,1]
2277       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2278     }
2279     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2280     wsrc = wdst;
2281     vlen_enc = Assembler::AVX_128bit;
2282   }
2283   if (is_dst_valid) {
2284     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2285   }
2286 }
2287 
2288 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2289                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2290                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2291   XMMRegister wsrc = src;
2292   XMMRegister wdst = xmm_0;
2293   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2294   int vlen_enc = Assembler::AVX_128bit;
2295   if (vlen == 8) {
2296     vlen_enc = Assembler::AVX_256bit;
2297   }
2298   for (int i = log2(vlen) - 1; i >=0; i--) {
2299     if (i == 0 && !is_dst_valid) {
2300       wdst = dst;
2301     }
2302     if (i == 1) {
2303       vextracti128_high(wtmp, wsrc);
2304     } else if (i == 2) {
2305       vextracti64x4_high(wtmp, wsrc);
2306     } else {
2307       assert(i == 0, "%d", i);
2308       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2309     }
2310     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2311     wsrc = wdst;
2312     vlen_enc = Assembler::AVX_128bit;
2313   }
2314   if (is_dst_valid) {
2315     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2316   }
2317 }
2318 
2319 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2320   switch (bt) {
2321     case T_BYTE:  pextrb(dst, src, idx); break;
2322     case T_SHORT: pextrw(dst, src, idx); break;
2323     case T_INT:   pextrd(dst, src, idx); break;
2324     case T_LONG:  pextrq(dst, src, idx); break;
2325 
2326     default:
2327       assert(false,"Should not reach here.");
2328       break;
2329   }
2330 }
2331 
2332 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2333   int esize =  type2aelembytes(typ);
2334   int elem_per_lane = 16/esize;
2335   int lane = elemindex / elem_per_lane;
2336   int eindex = elemindex % elem_per_lane;
2337 
2338   if (lane >= 2) {
2339     assert(UseAVX > 2, "required");
2340     vextractf32x4(dst, src, lane & 3);
2341     return dst;
2342   } else if (lane > 0) {
2343     assert(UseAVX > 0, "required");
2344     vextractf128(dst, src, lane);
2345     return dst;
2346   } else {
2347     return src;
2348   }
2349 }
2350 
2351 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2352   if (typ == T_BYTE) {
2353     movsbl(dst, dst);
2354   } else if (typ == T_SHORT) {
2355     movswl(dst, dst);
2356   }
2357 }
2358 
2359 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2360   int esize =  type2aelembytes(typ);
2361   int elem_per_lane = 16/esize;
2362   int eindex = elemindex % elem_per_lane;
2363   assert(is_integral_type(typ),"required");
2364 
2365   if (eindex == 0) {
2366     if (typ == T_LONG) {
2367       movq(dst, src);
2368     } else {
2369       movdl(dst, src);
2370       movsxl(typ, dst);
2371     }
2372   } else {
2373     extract(typ, dst, src, eindex);
2374     movsxl(typ, dst);
2375   }
2376 }
2377 
2378 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2379   int esize =  type2aelembytes(typ);
2380   int elem_per_lane = 16/esize;
2381   int eindex = elemindex % elem_per_lane;
2382   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2383 
2384   if (eindex == 0) {
2385     movq(dst, src);
2386   } else {
2387     if (typ == T_FLOAT) {
2388       if (UseAVX == 0) {
2389         movdqu(dst, src);
2390         shufps(dst, dst, eindex);
2391       } else {
2392         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2393       }
2394     } else {
2395       if (UseAVX == 0) {
2396         movdqu(dst, src);
2397         psrldq(dst, eindex*esize);
2398       } else {
2399         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2400       }
2401       movq(dst, dst);
2402     }
2403   }
2404   // Zero upper bits
2405   if (typ == T_FLOAT) {
2406     if (UseAVX == 0) {
2407       assert(vtmp != xnoreg, "required.");
2408       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2409       pand(dst, vtmp);
2410     } else {
2411       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2412     }
2413   }
2414 }
2415 
2416 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2417   switch(typ) {
2418     case T_BYTE:
2419     case T_BOOLEAN:
2420       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2421       break;
2422     case T_SHORT:
2423     case T_CHAR:
2424       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2425       break;
2426     case T_INT:
2427     case T_FLOAT:
2428       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2429       break;
2430     case T_LONG:
2431     case T_DOUBLE:
2432       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2433       break;
2434     default:
2435       assert(false,"Should not reach here.");
2436       break;
2437   }
2438 }
2439 
2440 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2441   assert(rscratch != noreg || always_reachable(src2), "missing");
2442 
2443   switch(typ) {
2444     case T_BOOLEAN:
2445     case T_BYTE:
2446       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2447       break;
2448     case T_CHAR:
2449     case T_SHORT:
2450       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2451       break;
2452     case T_INT:
2453     case T_FLOAT:
2454       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2455       break;
2456     case T_LONG:
2457     case T_DOUBLE:
2458       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2459       break;
2460     default:
2461       assert(false,"Should not reach here.");
2462       break;
2463   }
2464 }
2465 
2466 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2467   switch(typ) {
2468     case T_BYTE:
2469       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2470       break;
2471     case T_SHORT:
2472       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2473       break;
2474     case T_INT:
2475     case T_FLOAT:
2476       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2477       break;
2478     case T_LONG:
2479     case T_DOUBLE:
2480       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2481       break;
2482     default:
2483       assert(false,"Should not reach here.");
2484       break;
2485   }
2486 }
2487 
2488 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2489   assert(vlen_in_bytes <= 32, "");
2490   int esize = type2aelembytes(bt);
2491   if (vlen_in_bytes == 32) {
2492     assert(vtmp == xnoreg, "required.");
2493     if (esize >= 4) {
2494       vtestps(src1, src2, AVX_256bit);
2495     } else {
2496       vptest(src1, src2, AVX_256bit);
2497     }
2498     return;
2499   }
2500   if (vlen_in_bytes < 16) {
2501     // Duplicate the lower part to fill the whole register;
2502     // there is no need to do so for src2.
2503     assert(vtmp != xnoreg, "required");
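         // 0x00 broadcasts dword 0; 0x04 keeps dwords 0-1 and fills the upper half
         // with copies of dword 0.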
2504     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2505     pshufd(vtmp, src1, shuffle_imm);
2506   } else {
2507     assert(vtmp == xnoreg, "required");
2508     vtmp = src1;
2509   }
2510   if (esize >= 4 && VM_Version::supports_avx()) {
2511     vtestps(vtmp, src2, AVX_128bit);
2512   } else {
2513     ptest(vtmp, src2);
2514   }
2515 }
2516 
2517 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2518   assert(UseAVX >= 2, "required");
2519 #ifdef ASSERT
2520   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2521   bool is_bw_supported = VM_Version::supports_avx512bw();
2522   if (is_bw && !is_bw_supported) {
2523     assert(vlen_enc != Assembler::AVX_512bit, "required");
2524     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2525            "XMM register should be 0-15");
2526   }
2527 #endif // ASSERT
2528   switch (elem_bt) {
2529     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2530     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2531     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2532     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2533     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2534     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2535     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2536   }
2537 }
2538 
2539 #ifdef _LP64
2540 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2541   assert(UseAVX >= 2, "required");
2542   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2543   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2544   if ((UseAVX > 2) &&
2545       (!is_bw || VM_Version::supports_avx512bw()) &&
2546       (!is_vl || VM_Version::supports_avx512vl())) {
2547     switch (elem_bt) {
2548       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2549       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2550       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2551       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2552       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2553     }
2554   } else {
2555     assert(vlen_enc != Assembler::AVX_512bit, "required");
2556     assert((dst->encoding() < 16),"XMM register should be 0-15");
2557     switch (elem_bt) {
2558       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2559       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2560       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2561       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2562       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2563       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2564       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2565     }
2566   }
2567 }
2568 #endif
2569 
2570 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2571   switch (to_elem_bt) {
2572     case T_SHORT:
2573       vpmovsxbw(dst, src, vlen_enc);
2574       break;
2575     case T_INT:
2576       vpmovsxbd(dst, src, vlen_enc);
2577       break;
2578     case T_FLOAT:
2579       vpmovsxbd(dst, src, vlen_enc);
2580       vcvtdq2ps(dst, dst, vlen_enc);
2581       break;
2582     case T_LONG:
2583       vpmovsxbq(dst, src, vlen_enc);
2584       break;
2585     case T_DOUBLE: {
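           // The int intermediate needs only half the width of the final double vector.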
2586       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2587       vpmovsxbd(dst, src, mid_vlen_enc);
2588       vcvtdq2pd(dst, dst, vlen_enc);
2589       break;
2590     }
2591     default:
2592       fatal("Unsupported type %s", type2name(to_elem_bt));
2593       break;
2594   }
2595 }
2596 
2597 //-------------------------------------------------------------------------------------------
2598 
2599 // IndexOf for constant substrings with size >= 8 chars
2600 // which don't need to be loaded through the stack.
2601 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2602                                          Register cnt1, Register cnt2,
2603                                          int int_cnt2,  Register result,
2604                                          XMMRegister vec, Register tmp,
2605                                          int ae) {
2606   ShortBranchVerifier sbv(this);
2607   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2608   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2609 
2610   // This method uses the pcmpestri instruction with bound registers
2611   //   inputs:
2612   //     xmm - substring
2613   //     rax - substring length (elements count)
2614   //     mem - scanned string
2615   //     rdx - string length (elements count)
2616   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2617   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2618   //   outputs:
2619   //     rcx - matched index in string
2620   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2621   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2622   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2623   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2624   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
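       // scale1 addresses elements of the scanned string (1 byte for LL, 2 bytes
       // otherwise); for UL the substring is Latin-1, so scale2 stays at 1 byte.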
2625 
2626   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2627         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2628         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2629 
2630   // Note, inline_string_indexOf() generates checks:
2631   // if (substr.count > string.count) return -1;
2632   // if (substr.count == 0) return 0;
2633   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2634 
2635   // Load substring.
2636   if (ae == StrIntrinsicNode::UL) {
2637     pmovzxbw(vec, Address(str2, 0));
2638   } else {
2639     movdqu(vec, Address(str2, 0));
2640   }
2641   movl(cnt2, int_cnt2);
2642   movptr(result, str1); // string addr
2643 
2644   if (int_cnt2 > stride) {
2645     jmpb(SCAN_TO_SUBSTR);
2646 
2647     // Reload substr for rescan; this code
2648     // is executed only for large substrings (> stride elements).
2649     bind(RELOAD_SUBSTR);
2650     if (ae == StrIntrinsicNode::UL) {
2651       pmovzxbw(vec, Address(str2, 0));
2652     } else {
2653       movdqu(vec, Address(str2, 0));
2654     }
2655     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2656 
2657     bind(RELOAD_STR);
2658     // We came here after the beginning of the substring was
2659     // matched but the rest of it was not so we need to search
2660     // again. Start from the next element after the previous match.
2661 
    // cnt2 is the number of remaining substring elements and
    // cnt1 is the number of remaining string elements when the compare failed.
2664     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2665     subl(cnt1, cnt2);
2666     addl(cnt1, int_cnt2);
2667     movl(cnt2, int_cnt2); // Now restore cnt2
2668 
2669     decrementl(cnt1);     // Shift to next element
2670     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2672 
2673     addptr(result, (1<<scale1));
2674 
2675   } // (int_cnt2 > 8)
2676 
2677   // Scan string for start of substr in 16-byte vectors
2678   bind(SCAN_TO_SUBSTR);
2679   pcmpestri(vec, Address(result, 0), mode);
2680   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2681   subl(cnt1, stride);
2682   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2683   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2685   addptr(result, 16);
2686   jmpb(SCAN_TO_SUBSTR);
2687 
2688   // Found a potential substr
2689   bind(FOUND_CANDIDATE);
2690   // Matched whole vector if first element matched (tmp(rcx) == 0).
2691   if (int_cnt2 == stride) {
2692     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2693   } else { // int_cnt2 > 8
2694     jccb(Assembler::overflow, FOUND_SUBSTR);
2695   }
2696   // After pcmpestri tmp(rcx) contains matched element index
2697   // Compute start addr of substr
2698   lea(result, Address(result, tmp, scale1));
2699 
2700   // Make sure string is still long enough
2701   subl(cnt1, tmp);
2702   cmpl(cnt1, cnt2);
2703   if (int_cnt2 == stride) {
2704     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2705   } else { // int_cnt2 > 8
2706     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2707   }
  // Left less than substring.
2709 
2710   bind(RET_NOT_FOUND);
2711   movl(result, -1);
2712   jmp(EXIT);
2713 
2714   if (int_cnt2 > stride) {
2715     // This code is optimized for the case when whole substring
2716     // is matched if its head is matched.
2717     bind(MATCH_SUBSTR_HEAD);
2718     pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
2720     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2721 
2722     Label CONT_SCAN_SUBSTR;
2723     // Compare the rest of substring (> 8 chars).
2724     bind(FOUND_SUBSTR);
2725     // First 8 chars are already matched.
2726     negptr(cnt2);
2727     addptr(cnt2, stride);
2728 
2729     bind(SCAN_SUBSTR);
2730     subl(cnt1, stride);
2731     cmpl(cnt2, -stride); // Do not read beyond substring
2732     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2733     // Back-up strings to avoid reading beyond substring:
2734     // cnt1 = cnt1 - cnt2 + 8
2735     addl(cnt1, cnt2); // cnt2 is negative
2736     addl(cnt1, stride);
2737     movl(cnt2, stride); negptr(cnt2);
2738     bind(CONT_SCAN_SUBSTR);
2739     if (int_cnt2 < (int)G) {
2740       int tail_off1 = int_cnt2<<scale1;
2741       int tail_off2 = int_cnt2<<scale2;
2742       if (ae == StrIntrinsicNode::UL) {
2743         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2744       } else {
2745         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2746       }
2747       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2748     } else {
2749       // calculate index in register to avoid integer overflow (int_cnt2*2)
2750       movl(tmp, int_cnt2);
2751       addptr(tmp, cnt2);
2752       if (ae == StrIntrinsicNode::UL) {
2753         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2754       } else {
2755         movdqu(vec, Address(str2, tmp, scale2, 0));
2756       }
2757       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2758     }
    // Need to reload the string pointers if the whole vector did not match
2760     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2761     addptr(cnt2, stride);
2762     jcc(Assembler::negative, SCAN_SUBSTR);
2763     // Fall through if found full substring
2764 
2765   } // (int_cnt2 > 8)
2766 
2767   bind(RET_FOUND);
2768   // Found result if we matched full small substring.
2769   // Compute substr offset
2770   subptr(result, str1);
2771   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2772     shrl(result, 1); // index
2773   }
2774   bind(EXIT);
2775 
2776 } // string_indexofC8
2777 
// Small strings are loaded through the stack if they cross a page boundary.
2779 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2780                                        Register cnt1, Register cnt2,
2781                                        int int_cnt2,  Register result,
2782                                        XMMRegister vec, Register tmp,
2783                                        int ae) {
2784   ShortBranchVerifier sbv(this);
2785   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2786   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2787 
2788   //
  // int_cnt2 is the length of a small (< 8 chars) constant substring,
  // or (-1) for a non-constant substring, in which case its length
  // is in the cnt2 register.
2792   //
2793   // Note, inline_string_indexOf() generates checks:
2794   // if (substr.count > string.count) return -1;
2795   // if (substr.count == 0) return 0;
2796   //
2797   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2798   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2799   // This method uses the pcmpestri instruction with bound registers
2800   //   inputs:
2801   //     xmm - substring
2802   //     rax - substring length (elements count)
2803   //     mem - scanned string
2804   //     rdx - string length (elements count)
2805   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2806   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2807   //   outputs:
2808   //     rcx - matched index in string
2809   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2810   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2811   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2812   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2813 
2814   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2815         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2816         FOUND_CANDIDATE;
2817 
2818   { //========================================================
2819     // We don't know where these strings are located
    // and we can't read beyond them. Load them through the stack.
2821     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2822 
2823     movptr(tmp, rsp); // save old SP
2824 
2825     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2826       if (int_cnt2 == (1>>scale2)) { // One byte
2827         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2828         load_unsigned_byte(result, Address(str2, 0));
2829         movdl(vec, result); // move 32 bits
2830       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2831         // Not enough header space in 32-bit VM: 12+3 = 15.
2832         movl(result, Address(str2, -1));
2833         shrl(result, 8);
2834         movdl(vec, result); // move 32 bits
2835       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2836         load_unsigned_short(result, Address(str2, 0));
2837         movdl(vec, result); // move 32 bits
2838       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2839         movdl(vec, Address(str2, 0)); // move 32 bits
2840       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2841         movq(vec, Address(str2, 0));  // move 64 bits
2842       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2843         // Array header size is 12 bytes in 32-bit VM
2844         // + 6 bytes for 3 chars == 18 bytes,
2845         // enough space to load vec and shift.
2846         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2847         if (ae == StrIntrinsicNode::UL) {
2848           int tail_off = int_cnt2-8;
2849           pmovzxbw(vec, Address(str2, tail_off));
2850           psrldq(vec, -2*tail_off);
2851         }
2852         else {
2853           int tail_off = int_cnt2*(1<<scale2);
2854           movdqu(vec, Address(str2, tail_off-16));
2855           psrldq(vec, 16-tail_off);
2856         }
2857       }
2858     } else { // not constant substring
2859       cmpl(cnt2, stride);
2860       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2861 
      // We can read beyond the string if str+16 does not cross a page boundary,
      // since heaps are aligned and mapped by pages.
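      // For example, with a 4K page: (str2 & 0xfff) <= 0xff0 guarantees that the
      // 16-byte load stays within the page the string starts on.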
2864       assert(os::vm_page_size() < (int)G, "default page should be small");
2865       movl(result, str2); // We need only low 32 bits
2866       andl(result, ((int)os::vm_page_size()-1));
2867       cmpl(result, ((int)os::vm_page_size()-16));
2868       jccb(Assembler::belowEqual, CHECK_STR);
2869 
      // Move small strings to the stack to allow loading 16 bytes into vec.
2871       subptr(rsp, 16);
2872       int stk_offset = wordSize-(1<<scale2);
2873       push(cnt2);
2874 
2875       bind(COPY_SUBSTR);
2876       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2877         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2878         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2879       } else if (ae == StrIntrinsicNode::UU) {
2880         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2881         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2882       }
2883       decrement(cnt2);
2884       jccb(Assembler::notZero, COPY_SUBSTR);
2885 
2886       pop(cnt2);
2887       movptr(str2, rsp);  // New substring address
2888     } // non constant
2889 
2890     bind(CHECK_STR);
2891     cmpl(cnt1, stride);
2892     jccb(Assembler::aboveEqual, BIG_STRINGS);
2893 
2894     // Check cross page boundary.
2895     movl(result, str1); // We need only low 32 bits
2896     andl(result, ((int)os::vm_page_size()-1));
2897     cmpl(result, ((int)os::vm_page_size()-16));
2898     jccb(Assembler::belowEqual, BIG_STRINGS);
2899 
2900     subptr(rsp, 16);
2901     int stk_offset = -(1<<scale1);
2902     if (int_cnt2 < 0) { // not constant
2903       push(cnt2);
2904       stk_offset += wordSize;
2905     }
2906     movl(cnt2, cnt1);
2907 
2908     bind(COPY_STR);
2909     if (ae == StrIntrinsicNode::LL) {
2910       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2911       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2912     } else {
2913       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2914       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2915     }
2916     decrement(cnt2);
2917     jccb(Assembler::notZero, COPY_STR);
2918 
2919     if (int_cnt2 < 0) { // not constant
2920       pop(cnt2);
2921     }
2922     movptr(str1, rsp);  // New string address
2923 
2924     bind(BIG_STRINGS);
2925     // Load substring.
2926     if (int_cnt2 < 0) { // -1
2927       if (ae == StrIntrinsicNode::UL) {
2928         pmovzxbw(vec, Address(str2, 0));
2929       } else {
2930         movdqu(vec, Address(str2, 0));
2931       }
2932       push(cnt2);       // substr count
2933       push(str2);       // substr addr
2934       push(str1);       // string addr
2935     } else {
2936       // Small (< 8 chars) constant substrings are loaded already.
2937       movl(cnt2, int_cnt2);
2938     }
2939     push(tmp);  // original SP
2940 
2941   } // Finished loading
2942 
2943   //========================================================
2944   // Start search
2945   //
2946 
2947   movptr(result, str1); // string addr
2948 
2949   if (int_cnt2  < 0) {  // Only for non constant substring
2950     jmpb(SCAN_TO_SUBSTR);
2951 
2952     // SP saved at sp+0
2953     // String saved at sp+1*wordSize
2954     // Substr saved at sp+2*wordSize
2955     // Substr count saved at sp+3*wordSize
2956 
    // Reload substr for rescan; this code
    // is executed only for large substrings (> 8 chars).
2959     bind(RELOAD_SUBSTR);
2960     movptr(str2, Address(rsp, 2*wordSize));
2961     movl(cnt2, Address(rsp, 3*wordSize));
2962     if (ae == StrIntrinsicNode::UL) {
2963       pmovzxbw(vec, Address(str2, 0));
2964     } else {
2965       movdqu(vec, Address(str2, 0));
2966     }
2967     // We came here after the beginning of the substring was
2968     // matched but the rest of it was not so we need to search
2969     // again. Start from the next element after the previous match.
2970     subptr(str1, result); // Restore counter
2971     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2972       shrl(str1, 1);
2973     }
2974     addl(cnt1, str1);
2975     decrementl(cnt1);   // Shift to next element
2976     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2978 
2979     addptr(result, (1<<scale1));
2980   } // non constant
2981 
2982   // Scan string for start of substr in 16-byte vectors
2983   bind(SCAN_TO_SUBSTR);
2984   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2985   pcmpestri(vec, Address(result, 0), mode);
2986   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2987   subl(cnt1, stride);
2988   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2989   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2991   addptr(result, 16);
2992 
2993   bind(ADJUST_STR);
2994   cmpl(cnt1, stride); // Do not read beyond string
2995   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2996   // Back-up string to avoid reading beyond string.
2997   lea(result, Address(result, cnt1, scale1, -16));
2998   movl(cnt1, stride);
2999   jmpb(SCAN_TO_SUBSTR);
3000 
3001   // Found a potential substr
3002   bind(FOUND_CANDIDATE);
3003   // After pcmpestri tmp(rcx) contains matched element index
3004 
3005   // Make sure string is still long enough
3006   subl(cnt1, tmp);
3007   cmpl(cnt1, cnt2);
3008   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.
3010 
3011   bind(RET_NOT_FOUND);
3012   movl(result, -1);
3013   jmp(CLEANUP);
3014 
3015   bind(FOUND_SUBSTR);
3016   // Compute start addr of substr
3017   lea(result, Address(result, tmp, scale1));
3018   if (int_cnt2 > 0) { // Constant substring
3019     // Repeat search for small substring (< 8 chars)
3020     // from new point without reloading substring.
3021     // Have to check that we don't read beyond string.
3022     cmpl(tmp, stride-int_cnt2);
3023     jccb(Assembler::greater, ADJUST_STR);
3024     // Fall through if matched whole substring.
3025   } else { // non constant
3026     assert(int_cnt2 == -1, "should be != 0");
3027 
3028     addl(tmp, cnt2);
3029     // Found result if we matched whole substring.
3030     cmpl(tmp, stride);
3031     jcc(Assembler::lessEqual, RET_FOUND);
3032 
3033     // Repeat search for small substring (<= 8 chars)
3034     // from new point 'str1' without reloading substring.
3035     cmpl(cnt2, stride);
3036     // Have to check that we don't read beyond string.
3037     jccb(Assembler::lessEqual, ADJUST_STR);
3038 
3039     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3040     // Compare the rest of substring (> 8 chars).
3041     movptr(str1, result);
3042 
3043     cmpl(tmp, cnt2);
3044     // First 8 chars are already matched.
3045     jccb(Assembler::equal, CHECK_NEXT);
3046 
3047     bind(SCAN_SUBSTR);
3048     pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload the string pointers if the whole vector did not match
3050     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3051 
3052     bind(CHECK_NEXT);
3053     subl(cnt2, stride);
3054     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3055     addptr(str1, 16);
3056     if (ae == StrIntrinsicNode::UL) {
3057       addptr(str2, 8);
3058     } else {
3059       addptr(str2, 16);
3060     }
3061     subl(cnt1, stride);
3062     cmpl(cnt2, stride); // Do not read beyond substring
3063     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3064     // Back-up strings to avoid reading beyond substring.
3065 
3066     if (ae == StrIntrinsicNode::UL) {
3067       lea(str2, Address(str2, cnt2, scale2, -8));
3068       lea(str1, Address(str1, cnt2, scale1, -16));
3069     } else {
3070       lea(str2, Address(str2, cnt2, scale2, -16));
3071       lea(str1, Address(str1, cnt2, scale1, -16));
3072     }
3073     subl(cnt1, cnt2);
3074     movl(cnt2, stride);
3075     addl(cnt1, stride);
3076     bind(CONT_SCAN_SUBSTR);
3077     if (ae == StrIntrinsicNode::UL) {
3078       pmovzxbw(vec, Address(str2, 0));
3079     } else {
3080       movdqu(vec, Address(str2, 0));
3081     }
3082     jmp(SCAN_SUBSTR);
3083 
3084     bind(RET_FOUND_LONG);
3085     movptr(str1, Address(rsp, wordSize));
3086   } // non constant
3087 
3088   bind(RET_FOUND);
3089   // Compute substr offset
3090   subptr(result, str1);
3091   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3092     shrl(result, 1); // index
3093   }
3094   bind(CLEANUP);
3095   pop(rsp); // restore SP
3096 
3097 } // string_indexof
3098 
3099 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3100                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3101   ShortBranchVerifier sbv(this);
3102   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3103 
3104   int stride = 8;
3105 
3106   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3107         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3108         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3109         FOUND_SEQ_CHAR, DONE_LABEL;
3110 
3111   movptr(result, str1);
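  // Sketch of the scan below: broadcast 'ch' across a vector register and compare
  // 16 chars (AVX2) or 8 chars (SSE) per iteration; ptest/vptest against the
  // all-zero vec2 sets CF only when the comparison result is all zero, so
  // carryClear means at least one lane matched.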
3112   if (UseAVX >= 2) {
3113     cmpl(cnt1, stride);
3114     jcc(Assembler::less, SCAN_TO_CHAR);
3115     cmpl(cnt1, 2*stride);
3116     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3117     movdl(vec1, ch);
3118     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3119     vpxor(vec2, vec2);
3120     movl(tmp, cnt1);
3121     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3122     andl(cnt1,0x0000000F);  //tail count (in chars)
3123 
3124     bind(SCAN_TO_16_CHAR_LOOP);
3125     vmovdqu(vec3, Address(result, 0));
3126     vpcmpeqw(vec3, vec3, vec1, 1);
3127     vptest(vec2, vec3);
3128     jcc(Assembler::carryClear, FOUND_CHAR);
3129     addptr(result, 32);
3130     subl(tmp, 2*stride);
3131     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3132     jmp(SCAN_TO_8_CHAR);
3133     bind(SCAN_TO_8_CHAR_INIT);
3134     movdl(vec1, ch);
3135     pshuflw(vec1, vec1, 0x00);
3136     pshufd(vec1, vec1, 0);
3137     pxor(vec2, vec2);
3138   }
3139   bind(SCAN_TO_8_CHAR);
3140   cmpl(cnt1, stride);
3141   jcc(Assembler::less, SCAN_TO_CHAR);
3142   if (UseAVX < 2) {
3143     movdl(vec1, ch);
3144     pshuflw(vec1, vec1, 0x00);
3145     pshufd(vec1, vec1, 0);
3146     pxor(vec2, vec2);
3147   }
3148   movl(tmp, cnt1);
3149   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3150   andl(cnt1,0x00000007);  //tail count (in chars)
3151 
3152   bind(SCAN_TO_8_CHAR_LOOP);
3153   movdqu(vec3, Address(result, 0));
3154   pcmpeqw(vec3, vec1);
3155   ptest(vec2, vec3);
3156   jcc(Assembler::carryClear, FOUND_CHAR);
3157   addptr(result, 16);
3158   subl(tmp, stride);
3159   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3160   bind(SCAN_TO_CHAR);
3161   testl(cnt1, cnt1);
3162   jcc(Assembler::zero, RET_NOT_FOUND);
3163   bind(SCAN_TO_CHAR_LOOP);
3164   load_unsigned_short(tmp, Address(result, 0));
3165   cmpl(ch, tmp);
3166   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3167   addptr(result, 2);
3168   subl(cnt1, 1);
3169   jccb(Assembler::zero, RET_NOT_FOUND);
3170   jmp(SCAN_TO_CHAR_LOOP);
3171 
3172   bind(RET_NOT_FOUND);
3173   movl(result, -1);
3174   jmpb(DONE_LABEL);
3175 
3176   bind(FOUND_CHAR);
3177   if (UseAVX >= 2) {
3178     vpmovmskb(tmp, vec3);
3179   } else {
3180     pmovmskb(tmp, vec3);
3181   }
3182   bsfl(ch, tmp);
3183   addptr(result, ch);
3184 
3185   bind(FOUND_SEQ_CHAR);
3186   subptr(result, str1);
3187   shrl(result, 1);
3188 
3189   bind(DONE_LABEL);
3190 } // string_indexof_char
3191 
3192 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3193                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3194   ShortBranchVerifier sbv(this);
3195   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3196 
3197   int stride = 16;
3198 
3199   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3200         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3201         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3202         FOUND_SEQ_CHAR, DONE_LABEL;
3203 
3204   movptr(result, str1);
3205   if (UseAVX >= 2) {
3206     cmpl(cnt1, stride);
3207     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3208     cmpl(cnt1, stride*2);
3209     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3210     movdl(vec1, ch);
3211     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3212     vpxor(vec2, vec2);
3213     movl(tmp, cnt1);
3214     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3215     andl(cnt1,0x0000001F);  //tail count (in chars)
3216 
3217     bind(SCAN_TO_32_CHAR_LOOP);
3218     vmovdqu(vec3, Address(result, 0));
3219     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3220     vptest(vec2, vec3);
3221     jcc(Assembler::carryClear, FOUND_CHAR);
3222     addptr(result, 32);
3223     subl(tmp, stride*2);
3224     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3225     jmp(SCAN_TO_16_CHAR);
3226 
3227     bind(SCAN_TO_16_CHAR_INIT);
3228     movdl(vec1, ch);
3229     pxor(vec2, vec2);
3230     pshufb(vec1, vec2);
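    // With an all-zero shuffle control (vec2), pshufb copies byte 0 of vec1 into
    // every lane, i.e. it broadcasts the search byte across the vector.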
3231   }
3232 
3233   bind(SCAN_TO_16_CHAR);
3234   cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3236   if (UseAVX < 2) {
3237     movdl(vec1, ch);
3238     pxor(vec2, vec2);
3239     pshufb(vec1, vec2);
3240   }
3241   movl(tmp, cnt1);
3242   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3243   andl(cnt1,0x0000000F);  //tail count (in bytes)
3244 
3245   bind(SCAN_TO_16_CHAR_LOOP);
3246   movdqu(vec3, Address(result, 0));
3247   pcmpeqb(vec3, vec1);
3248   ptest(vec2, vec3);
3249   jcc(Assembler::carryClear, FOUND_CHAR);
3250   addptr(result, 16);
3251   subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3253 
3254   bind(SCAN_TO_CHAR_INIT);
3255   testl(cnt1, cnt1);
3256   jcc(Assembler::zero, RET_NOT_FOUND);
3257   bind(SCAN_TO_CHAR_LOOP);
3258   load_unsigned_byte(tmp, Address(result, 0));
3259   cmpl(ch, tmp);
3260   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3261   addptr(result, 1);
3262   subl(cnt1, 1);
3263   jccb(Assembler::zero, RET_NOT_FOUND);
3264   jmp(SCAN_TO_CHAR_LOOP);
3265 
3266   bind(RET_NOT_FOUND);
3267   movl(result, -1);
3268   jmpb(DONE_LABEL);
3269 
3270   bind(FOUND_CHAR);
3271   if (UseAVX >= 2) {
3272     vpmovmskb(tmp, vec3);
3273   } else {
3274     pmovmskb(tmp, vec3);
3275   }
3276   bsfl(ch, tmp);
3277   addptr(result, ch);
3278 
3279   bind(FOUND_SEQ_CHAR);
3280   subptr(result, str1);
3281 
3282   bind(DONE_LABEL);
3283 } // stringL_indexof_char
3284 
3285 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3286   switch (eltype) {
3287   case T_BOOLEAN: return sizeof(jboolean);
3288   case T_BYTE:  return sizeof(jbyte);
3289   case T_SHORT: return sizeof(jshort);
3290   case T_CHAR:  return sizeof(jchar);
3291   case T_INT:   return sizeof(jint);
3292   default:
3293     ShouldNotReachHere();
3294     return -1;
3295   }
3296 }
3297 
3298 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3299   switch (eltype) {
3300   // T_BOOLEAN used as surrogate for unsigned byte
3301   case T_BOOLEAN: movzbl(dst, src);   break;
3302   case T_BYTE:    movsbl(dst, src);   break;
3303   case T_SHORT:   movswl(dst, src);   break;
3304   case T_CHAR:    movzwl(dst, src);   break;
3305   case T_INT:     movl(dst, src);     break;
3306   default:
3307     ShouldNotReachHere();
3308   }
3309 }
3310 
3311 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3312   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3313 }
3314 
3315 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3316   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3317 }
3318 
3319 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3320   const int vlen = Assembler::AVX_256bit;
3321   switch (eltype) {
3322   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3323   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3324   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3325   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3326   case T_INT:
3327     // do nothing
3328     break;
3329   default:
3330     ShouldNotReachHere();
3331   }
3332 }
3333 
3334 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3335                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3336                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3337                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3338                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3339                                         BasicType eltype) {
3340   ShortBranchVerifier sbv(this);
3341   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3342   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3343   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3344 
3345   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3346         SHORT_UNROLLED_LOOP_EXIT,
3347         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3348         UNROLLED_VECTOR_LOOP_BEGIN,
3349         END;
3350   switch (eltype) {
3351   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3352   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3353   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3354   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3355   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3356   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3357   }
3358 
3359   // For "renaming" for readibility of the code
3360   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3361                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3362                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3363 
3364   const int elsize = arrays_hashcode_elsize(eltype);
3365 
3366   /*
3367     if (cnt1 >= 2) {
3368       if (cnt1 >= 32) {
3369         UNROLLED VECTOR LOOP
3370       }
3371       UNROLLED SCALAR LOOP
3372     }
3373     SINGLE SCALAR
3374    */
3375 
3376   cmpl(cnt1, 32);
3377   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3378 
3379   // cnt1 >= 32 && generate_vectorized_loop
3380   xorl(index, index);
3381 
3382   // vresult = IntVector.zero(I256);
3383   for (int idx = 0; idx < 4; idx++) {
3384     vpxor(vresult[idx], vresult[idx]);
3385   }
3386   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3387   Register bound = tmp2;
3388   Register next = tmp3;
3389   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3390   movl(next, Address(tmp2, 0));
3391   movdl(vnext, next);
3392   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3393 
3394   // index = 0;
3395   // bound = cnt1 & ~(32 - 1);
3396   movl(bound, cnt1);
3397   andl(bound, ~(32 - 1));
3398   // for (; index < bound; index += 32) {
3399   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3400   // result *= next;
3401   imull(result, next);
  // Loop fission to front-load the cost of fetching from memory; out-of-order (OOO)
  // execution can then hopefully do a better job of prefetching.
3404   for (int idx = 0; idx < 4; idx++) {
3405     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3406   }
3407   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3408   for (int idx = 0; idx < 4; idx++) {
3409     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3410     arrays_hashcode_elvcast(vtmp[idx], eltype);
3411     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3412   }
3413   // index += 32;
3414   addl(index, 32);
3415   // index < bound;
3416   cmpl(index, bound);
3417   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3418   // }
3419 
3420   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3421   subl(cnt1, bound);
3422   // release bound
3423 
3424   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3425   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3426   for (int idx = 0; idx < 4; idx++) {
3427     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, (int)((8 * idx + 1) * sizeof(jint))), T_INT);
3428     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3429   }
3430   // result += vresult.reduceLanes(ADD);
3431   for (int idx = 0; idx < 4; idx++) {
3432     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3433   }
3434 
3435   // } else if (cnt1 < 32) {
3436 
3437   bind(SHORT_UNROLLED_BEGIN);
3438   // int i = 1;
3439   movl(index, 1);
3440   cmpl(index, cnt1);
3441   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3442 
3443   // for (; i < cnt1 ; i += 2) {
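  // Each iteration folds in two elements: result = result*31*31 + a[i-1]*31 + a[i];
  // 961 == 31*31, and the multiply by 31 is done as (x << 5) - x.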
3444   bind(SHORT_UNROLLED_LOOP_BEGIN);
3445   movl(tmp3, 961);
3446   imull(result, tmp3);
3447   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3448   movl(tmp3, tmp2);
3449   shll(tmp3, 5);
3450   subl(tmp3, tmp2);
3451   addl(result, tmp3);
3452   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3453   addl(result, tmp3);
3454   addl(index, 2);
3455   cmpl(index, cnt1);
3456   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3457 
3458   // }
3459   // if (i >= cnt1) {
3460   bind(SHORT_UNROLLED_LOOP_EXIT);
3461   jccb(Assembler::greater, END);
3462   movl(tmp2, result);
3463   shll(result, 5);
3464   subl(result, tmp2);
3465   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3466   addl(result, tmp3);
3467   // }
3468   bind(END);
3469 
3470   BLOCK_COMMENT("} // arrays_hashcode");
3471 
3472 } // arrays_hashcode
3473 
3474 // helper function for string_compare
3475 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3476                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3477                                            Address::ScaleFactor scale2, Register index, int ae) {
3478   if (ae == StrIntrinsicNode::LL) {
3479     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3480     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3481   } else if (ae == StrIntrinsicNode::UU) {
3482     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3483     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3484   } else {
3485     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3486     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3487   }
3488 }
3489 
3490 // Compare strings, used for char[] and byte[].
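// Roughly, the comparison implemented here is (illustrative Java-level sketch):
//   int lim = Math.min(cnt1, cnt2);
//   for (int k = 0; k < lim; k++) {
//     if (str1[k] != str2[k]) return str1[k] - str2[k];
//   }
//   return cnt1 - cnt2;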
3491 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3492                                        Register cnt1, Register cnt2, Register result,
3493                                        XMMRegister vec1, int ae, KRegister mask) {
3494   ShortBranchVerifier sbv(this);
3495   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3496   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3497   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3498   int stride2x2 = 0x40;
3499   Address::ScaleFactor scale = Address::no_scale;
3500   Address::ScaleFactor scale1 = Address::no_scale;
3501   Address::ScaleFactor scale2 = Address::no_scale;
3502 
3503   if (ae != StrIntrinsicNode::LL) {
3504     stride2x2 = 0x20;
3505   }
3506 
3507   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3508     shrl(cnt2, 1);
3509   }
3510   // Compute the minimum of the string lengths and the
3511   // difference of the string lengths (stack).
3512   // Do the conditional move stuff
3513   movl(result, cnt1);
3514   subl(cnt1, cnt2);
3515   push(cnt1);
3516   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3517 
3518   // Is the minimum length zero?
3519   testl(cnt2, cnt2);
3520   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3521   if (ae == StrIntrinsicNode::LL) {
3522     // Load first bytes
3523     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3524     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3525   } else if (ae == StrIntrinsicNode::UU) {
3526     // Load first characters
3527     load_unsigned_short(result, Address(str1, 0));
3528     load_unsigned_short(cnt1, Address(str2, 0));
3529   } else {
3530     load_unsigned_byte(result, Address(str1, 0));
3531     load_unsigned_short(cnt1, Address(str2, 0));
3532   }
3533   subl(result, cnt1);
3534   jcc(Assembler::notZero,  POP_LABEL);
3535 
3536   if (ae == StrIntrinsicNode::UU) {
3537     // Divide length by 2 to get number of chars
3538     shrl(cnt2, 1);
3539   }
3540   cmpl(cnt2, 1);
3541   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3542 
3543   // Check if the strings start at the same location and setup scale and stride
3544   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3545     cmpptr(str1, str2);
3546     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3547     if (ae == StrIntrinsicNode::LL) {
3548       scale = Address::times_1;
3549       stride = 16;
3550     } else {
3551       scale = Address::times_2;
3552       stride = 8;
3553     }
3554   } else {
3555     scale1 = Address::times_1;
3556     scale2 = Address::times_2;
3557     // scale not used
3558     stride = 8;
3559   }
3560 
3561   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3562     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3563     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3564     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3565     Label COMPARE_TAIL_LONG;
3566     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3567 
3568     int pcmpmask = 0x19;
3569     if (ae == StrIntrinsicNode::LL) {
3570       pcmpmask &= ~0x01;
3571     }
3572 
3573     // Setup to compare 16-chars (32-bytes) vectors,
3574     // start from first character again because it has aligned address.
3575     if (ae == StrIntrinsicNode::LL) {
3576       stride2 = 32;
3577     } else {
3578       stride2 = 16;
3579     }
3580     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3581       adr_stride = stride << scale;
3582     } else {
3583       adr_stride1 = 8;  //stride << scale1;
3584       adr_stride2 = 16; //stride << scale2;
3585     }
3586 
3587     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as element counters
3589     movl(result, cnt2);
3590     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3591     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3592 
3593     // fast path : compare first 2 8-char vectors.
3594     bind(COMPARE_16_CHARS);
3595     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3596       movdqu(vec1, Address(str1, 0));
3597     } else {
3598       pmovzxbw(vec1, Address(str1, 0));
3599     }
3600     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3601     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3602 
3603     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3604       movdqu(vec1, Address(str1, adr_stride));
3605       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3606     } else {
3607       pmovzxbw(vec1, Address(str1, adr_stride1));
3608       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3609     }
3610     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3611     addl(cnt1, stride);
3612 
3613     // Compare the characters at index in cnt1
3614     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3615     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3616     subl(result, cnt2);
3617     jmp(POP_LABEL);
3618 
3619     // Setup the registers to start vector comparison loop
3620     bind(COMPARE_WIDE_VECTORS);
3621     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3622       lea(str1, Address(str1, result, scale));
3623       lea(str2, Address(str2, result, scale));
3624     } else {
3625       lea(str1, Address(str1, result, scale1));
3626       lea(str2, Address(str2, result, scale2));
3627     }
3628     subl(result, stride2);
3629     subl(cnt2, stride2);
3630     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3631     negptr(result);
3632 
3633     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3634     bind(COMPARE_WIDE_VECTORS_LOOP);
3635 
3636 #ifdef _LP64
3637     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3638       cmpl(cnt2, stride2x2);
3639       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3640       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3641       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3642 
3643       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3644       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3645         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3646         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3647       } else {
3648         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3649         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3650       }
3651       kortestql(mask, mask);
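      // kortestql sets CF only when the OR of the masks is all ones, so
      // aboveEqual (CF == 0) below means at least one byte compared unequal.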
3652       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3653       addptr(result, stride2x2);  // update since we already compared at this addr
3654       subl(cnt2, stride2x2);      // and sub the size too
3655       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3656 
3657       vpxor(vec1, vec1);
3658       jmpb(COMPARE_WIDE_TAIL);
3659     }//if (VM_Version::supports_avx512vlbw())
3660 #endif // _LP64
3661 
3662 
3663     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3664     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3665       vmovdqu(vec1, Address(str1, result, scale));
3666       vpxor(vec1, Address(str2, result, scale));
3667     } else {
3668       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3669       vpxor(vec1, Address(str2, result, scale2));
3670     }
3671     vptest(vec1, vec1);
3672     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3673     addptr(result, stride2);
3674     subl(cnt2, stride2);
3675     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3676     // clean upper bits of YMM registers
3677     vpxor(vec1, vec1);
3678 
3679     // compare wide vectors tail
3680     bind(COMPARE_WIDE_TAIL);
3681     testptr(result, result);
3682     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3683 
3684     movl(result, stride2);
3685     movl(cnt2, result);
3686     negptr(result);
3687     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3688 
    // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3690     bind(VECTOR_NOT_EQUAL);
3691     // clean upper bits of YMM registers
3692     vpxor(vec1, vec1);
3693     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3694       lea(str1, Address(str1, result, scale));
3695       lea(str2, Address(str2, result, scale));
3696     } else {
3697       lea(str1, Address(str1, result, scale1));
3698       lea(str2, Address(str2, result, scale2));
3699     }
3700     jmp(COMPARE_16_CHARS);
3701 
3702     // Compare tail chars, length between 1 to 15 chars
3703     bind(COMPARE_TAIL_LONG);
3704     movl(cnt2, result);
3705     cmpl(cnt2, stride);
3706     jcc(Assembler::less, COMPARE_SMALL_STR);
3707 
3708     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3709       movdqu(vec1, Address(str1, 0));
3710     } else {
3711       pmovzxbw(vec1, Address(str1, 0));
3712     }
3713     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3714     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3715     subptr(cnt2, stride);
3716     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3717     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3718       lea(str1, Address(str1, result, scale));
3719       lea(str2, Address(str2, result, scale));
3720     } else {
3721       lea(str1, Address(str1, result, scale1));
3722       lea(str2, Address(str2, result, scale2));
3723     }
3724     negptr(cnt2);
3725     jmpb(WHILE_HEAD_LABEL);
3726 
3727     bind(COMPARE_SMALL_STR);
3728   } else if (UseSSE42Intrinsics) {
3729     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3730     int pcmpmask = 0x19;
3731     // Setup to compare 8-char (16-byte) vectors,
3732     // start from first character again because it has aligned address.
3733     movl(result, cnt2);
3734     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3735     if (ae == StrIntrinsicNode::LL) {
3736       pcmpmask &= ~0x01;
3737     }
3738     jcc(Assembler::zero, COMPARE_TAIL);
3739     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3740       lea(str1, Address(str1, result, scale));
3741       lea(str2, Address(str2, result, scale));
3742     } else {
3743       lea(str1, Address(str1, result, scale1));
3744       lea(str2, Address(str2, result, scale2));
3745     }
3746     negptr(result);
3747 
3748     // pcmpestri
3749     //   inputs:
3750     //     vec1- substring
3751     //     rax - negative string length (elements count)
3752     //     mem - scanned string
3753     //     rdx - string length (elements count)
3754     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3755     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3756     //   outputs:
3757     //     rcx - first mismatched element index
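    // For this imm8 (0x18/0x19): bits 1:0 element format (00 = unsigned bytes,
    //   01 = unsigned words), bits 3:2 = 10 (equal each), bits 5:4 = 01 (negate
    //   the result) and bit 6 = 0 (least significant differing index, in rcx).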
3758     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3759 
3760     bind(COMPARE_WIDE_VECTORS);
3761     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3762       movdqu(vec1, Address(str1, result, scale));
3763       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3764     } else {
3765       pmovzxbw(vec1, Address(str1, result, scale1));
3766       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3767     }
3768     // After pcmpestri cnt1(rcx) contains mismatched element index
3769 
3770     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3771     addptr(result, stride);
3772     subptr(cnt2, stride);
3773     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3774 
3775     // compare wide vectors tail
3776     testptr(result, result);
3777     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3778 
3779     movl(cnt2, stride);
3780     movl(result, stride);
3781     negptr(result);
3782     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3783       movdqu(vec1, Address(str1, result, scale));
3784       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3785     } else {
3786       pmovzxbw(vec1, Address(str1, result, scale1));
3787       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3788     }
3789     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3790 
3791     // Mismatched characters in the vectors
3792     bind(VECTOR_NOT_EQUAL);
3793     addptr(cnt1, result);
3794     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3795     subl(result, cnt2);
3796     jmpb(POP_LABEL);
3797 
3798     bind(COMPARE_TAIL); // limit is zero
3799     movl(cnt2, result);
3800     // Fallthru to tail compare
3801   }
3802   // Shift str2 and str1 to the end of the arrays, negate min
3803   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3804     lea(str1, Address(str1, cnt2, scale));
3805     lea(str2, Address(str2, cnt2, scale));
3806   } else {
3807     lea(str1, Address(str1, cnt2, scale1));
3808     lea(str2, Address(str2, cnt2, scale2));
3809   }
3810   decrementl(cnt2);  // first character was compared already
3811   negptr(cnt2);
3812 
3813   // Compare the rest of the elements
3814   bind(WHILE_HEAD_LABEL);
3815   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3816   subl(result, cnt1);
3817   jccb(Assembler::notZero, POP_LABEL);
3818   increment(cnt2);
3819   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3820 
3821   // Strings are equal up to min length.  Return the length difference.
3822   bind(LENGTH_DIFF_LABEL);
3823   pop(result);
3824   if (ae == StrIntrinsicNode::UU) {
3825     // Divide diff by 2 to get number of chars
3826     sarl(result, 1);
3827   }
3828   jmpb(DONE_LABEL);
3829 
3830 #ifdef _LP64
3831   if (VM_Version::supports_avx512vlbw()) {
3832 
3833     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3834 
3835     kmovql(cnt1, mask);
3836     notq(cnt1);
3837     bsfq(cnt2, cnt1);
3838     if (ae != StrIntrinsicNode::LL) {
3839       // Divide diff by 2 to get number of chars
3840       sarl(cnt2, 1);
3841     }
3842     addq(result, cnt2);
3843     if (ae == StrIntrinsicNode::LL) {
3844       load_unsigned_byte(cnt1, Address(str2, result));
3845       load_unsigned_byte(result, Address(str1, result));
3846     } else if (ae == StrIntrinsicNode::UU) {
3847       load_unsigned_short(cnt1, Address(str2, result, scale));
3848       load_unsigned_short(result, Address(str1, result, scale));
3849     } else {
3850       load_unsigned_short(cnt1, Address(str2, result, scale2));
3851       load_unsigned_byte(result, Address(str1, result, scale1));
3852     }
3853     subl(result, cnt1);
3854     jmpb(POP_LABEL);
3855   }//if (VM_Version::supports_avx512vlbw())
3856 #endif // _LP64
3857 
3858   // Discard the stored length difference
3859   bind(POP_LABEL);
3860   pop(cnt1);
3861 
3862   // That's it
3863   bind(DONE_LABEL);
3864   if(ae == StrIntrinsicNode::UL) {
3865     negl(result);
3866   }
3867 
3868 }
3869 
// Search for a non-ASCII character (negative byte value) in a byte array,
3871 // return the index of the first such character, otherwise the length
3872 // of the array segment searched.
3873 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3874 //   @IntrinsicCandidate
3875 //   public static int countPositives(byte[] ba, int off, int len) {
3876 //     for (int i = off; i < off + len; i++) {
3877 //       if (ba[i] < 0) {
3878 //         return i - off;
3879 //       }
3880 //     }
3881 //     return len;
3882 //   }
3883 void C2_MacroAssembler::count_positives(Register ary1, Register len,
3884   Register result, Register tmp1,
3885   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3886   // rsi: byte array
3887   // rcx: len
3888   // rax: result
3889   ShortBranchVerifier sbv(this);
3890   assert_different_registers(ary1, len, result, tmp1);
3891   assert_different_registers(vec1, vec2);
3892   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3893 
3894   movl(result, len); // copy
3895   // len == 0
3896   testl(len, len);
3897   jcc(Assembler::zero, DONE);
3898 
3899   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3900     VM_Version::supports_avx512vlbw() &&
3901     VM_Version::supports_bmi2()) {
3902 
3903     Label test_64_loop, test_tail, BREAK_LOOP;
3904     movl(tmp1, len);
3905     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3906 
3907     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
3908     andl(len,  0xffffffc0); // vector count (in chars)
3909     jccb(Assembler::zero, test_tail);
3910 
3911     lea(ary1, Address(ary1, len, Address::times_1));
3912     negptr(len);
3913 
3914     bind(test_64_loop);
    // Check whether our 64 byte-sized elements contain negatives
3916     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3917     kortestql(mask1, mask1);
3918     jcc(Assembler::notZero, BREAK_LOOP);
3919 
3920     addptr(len, 64);
3921     jccb(Assembler::notZero, test_64_loop);
3922 
3923     bind(test_tail);
3924     // bail out when there is nothing to be done
3925     testl(tmp1, -1);
3926     jcc(Assembler::zero, DONE);
3927 
3928 
    // check the tail for absence of negatives
3930     // ~(~0 << len) applied up to two times (for 32-bit scenario)
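    // e.g. a tail count of 5 in tmp1 yields the mask 0b11111, so only the 5 tail
    // bytes take part in the masked compare below.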
3931 #ifdef _LP64
3932     {
3933       Register tmp3_aliased = len;
3934       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3935       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3936       notq(tmp3_aliased);
3937       kmovql(mask2, tmp3_aliased);
3938     }
3939 #else
3940     Label k_init;
3941     jmp(k_init);
3942 
    // We cannot read 64 bits from a general-purpose register here, so we place the
    // data required to compose 64 ones into the instruction stream.
    // We emit a 64-byte-wide series of the elements 0..63, which is later used as
    // the compare target against the tail count held in the tmp1 register.
    // The result is a k register with tmp1 consecutive 1's, counting from the
    // least significant bit.
3949     address tmp = pc();
3950     emit_int64(0x0706050403020100);
3951     emit_int64(0x0F0E0D0C0B0A0908);
3952     emit_int64(0x1716151413121110);
3953     emit_int64(0x1F1E1D1C1B1A1918);
3954     emit_int64(0x2726252423222120);
3955     emit_int64(0x2F2E2D2C2B2A2928);
3956     emit_int64(0x3736353433323130);
3957     emit_int64(0x3F3E3D3C3B3A3938);
3958 
3959     bind(k_init);
3960     lea(len, InternalAddress(tmp));
3961     // create mask to test for negative byte inside a vector
3962     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3963     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3964 
3965 #endif
3966     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3967     ktestq(mask1, mask2);
3968     jcc(Assembler::zero, DONE);
3969 
    // do a full check for negative bytes in the tail
3971     movl(len, tmp1); // tmp1 holds low 6-bit from original len;
3972                      // ary1 already pointing to the right place
3973     jmpb(TAIL_START);
3974 
3975     bind(BREAK_LOOP);
3976     // At least one byte in the last 64 byte block was negative.
3977     // Set up to look at the last 64 bytes as if they were a tail
3978     lea(ary1, Address(ary1, len, Address::times_1));
3979     addptr(result, len);
3980     // Ignore the very last byte: if all others are positive,
3981     // it must be negative, so we can skip right to the 2+1 byte
3982     // end comparison at this point
3983     orl(result, 63);
3984     movl(len, 63);
3985     // Fallthru to tail compare
3986   } else {
3987 
3988     if (UseAVX >= 2 && UseSSE >= 2) {
3989       // With AVX2, use 32-byte vector compare
3990       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3991 
3992       // Compare 32-byte vectors
3993       testl(len, 0xffffffe0);   // vector count (in bytes)
3994       jccb(Assembler::zero, TAIL_START);
3995 
3996       andl(len, 0xffffffe0);
3997       lea(ary1, Address(ary1, len, Address::times_1));
3998       negptr(len);
3999 
      movl(tmp1, 0x80808080);   // create mask to test for negative (non-ASCII) bytes in vector
4001       movdl(vec2, tmp1);
4002       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
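      // vec2 now has only the sign bit of every byte set; vptest(data, vec2)
      // leaves ZF clear exactly when some byte in the 32-byte chunk is negative.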
4003 
4004       bind(COMPARE_WIDE_VECTORS);
4005       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4006       vptest(vec1, vec2);
4007       jccb(Assembler::notZero, BREAK_LOOP);
4008       addptr(len, 32);
4009       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4010 
4011       testl(result, 0x0000001f);   // any bytes remaining?
4012       jcc(Assembler::zero, DONE);
4013 
4014       // Quick test using the already prepared vector mask
4015       movl(len, result);
4016       andl(len, 0x0000001f);
4017       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4018       vptest(vec1, vec2);
4019       jcc(Assembler::zero, DONE);
4020       // There are zeros, jump to the tail to determine exactly where
4021       jmpb(TAIL_START);
4022 
4023       bind(BREAK_LOOP);
4024       // At least one byte in the last 32-byte vector is negative.
4025       // Set up to look at the last 32 bytes as if they were a tail
4026       lea(ary1, Address(ary1, len, Address::times_1));
4027       addptr(result, len);
4028       // Ignore the very last byte: if all others are positive,
4029       // it must be negative, so we can skip right to the 2+1 byte
4030       // end comparison at this point
4031       orl(result, 31);
4032       movl(len, 31);
4033       // Fallthru to tail compare
4034     } else if (UseSSE42Intrinsics) {
4035       // With SSE4.2, use double quad vector compare
4036       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4037 
4038       // Compare 16-byte vectors
4039       testl(len, 0xfffffff0);   // vector count (in bytes)
4040       jcc(Assembler::zero, TAIL_START);
4041 
4042       andl(len, 0xfffffff0);
4043       lea(ary1, Address(ary1, len, Address::times_1));
4044       negptr(len);
4045 
4046       movl(tmp1, 0x80808080);
4047       movdl(vec2, tmp1);
4048       pshufd(vec2, vec2, 0);
4049 
4050       bind(COMPARE_WIDE_VECTORS);
4051       movdqu(vec1, Address(ary1, len, Address::times_1));
4052       ptest(vec1, vec2);
4053       jccb(Assembler::notZero, BREAK_LOOP);
4054       addptr(len, 16);
4055       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4056 
4057       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4058       jcc(Assembler::zero, DONE);
4059 
4060       // Quick test using the already prepared vector mask
4061       movl(len, result);
4062       andl(len, 0x0000000f);   // tail count (in bytes)
4063       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4064       ptest(vec1, vec2);
4065       jcc(Assembler::zero, DONE);
4066       jmpb(TAIL_START);
4067 
4068       bind(BREAK_LOOP);
4069       // At least one byte in the last 16-byte vector is negative.
4070       // Set up and look at the last 16 bytes as if they were a tail
4071       lea(ary1, Address(ary1, len, Address::times_1));
4072       addptr(result, len);
4073       // Ignore the very last byte: if all others are positive,
4074       // it must be negative, so we can skip right to the 2+1 byte
4075       // end comparison at this point
4076       orl(result, 15);
4077       movl(len, 15);
4078       // Fallthru to tail compare
4079     }
4080   }
4081 
4082   bind(TAIL_START);
4083   // Compare 4-byte vectors
4084   andl(len, 0xfffffffc); // vector count (in bytes)
4085   jccb(Assembler::zero, COMPARE_CHAR);
4086 
4087   lea(ary1, Address(ary1, len, Address::times_1));
4088   negptr(len);
4089 
4090   bind(COMPARE_VECTORS);
4091   movl(tmp1, Address(ary1, len, Address::times_1));
4092   andl(tmp1, 0x80808080);
4093   jccb(Assembler::notZero, TAIL_ADJUST);
4094   addptr(len, 4);
4095   jccb(Assembler::notZero, COMPARE_VECTORS);
4096 
4097   // Compare trailing char (final 2-3 bytes), if any
4098   bind(COMPARE_CHAR);
4099 
4100   testl(result, 0x2);   // tail  char
4101   jccb(Assembler::zero, COMPARE_BYTE);
4102   load_unsigned_short(tmp1, Address(ary1, 0));
4103   andl(tmp1, 0x00008080);
4104   jccb(Assembler::notZero, CHAR_ADJUST);
4105   lea(ary1, Address(ary1, 2));
4106 
4107   bind(COMPARE_BYTE);
4108   testl(result, 0x1);   // tail  byte
4109   jccb(Assembler::zero, DONE);
4110   load_unsigned_byte(tmp1, Address(ary1, 0));
4111   testl(tmp1, 0x00000080);
4112   jccb(Assembler::zero, DONE);
4113   subptr(result, 1);
4114   jmpb(DONE);
4115 
4116   bind(TAIL_ADJUST);
4117   // there are negative bits in the last 4 byte block.
4118   // Adjust result and check the next three bytes
4119   addptr(result, len);
4120   orl(result, 3);
4121   lea(ary1, Address(ary1, len, Address::times_1));
4122   jmpb(COMPARE_CHAR);
4123 
4124   bind(CHAR_ADJUST);
4125   // We are looking at a char + optional byte tail, and found that one
4126   // of the bytes in the char is negative. Adjust the result, check the
4127   // first byte and readjust if needed.
4128   andl(result, 0xfffffffc);
4129   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4130   jccb(Assembler::notZero, DONE);
4131   addptr(result, 1);
4132 
4133   // That's it
4134   bind(DONE);
4135   if (UseAVX >= 2 && UseSSE >= 2) {
4136     // clean upper bits of YMM registers
4137     vpxor(vec1, vec1);
4138     vpxor(vec2, vec2);
4139   }
4140 }
4141 
4142 // Compare char[] or byte[] arrays, or substrings of them, aligned to 4 bytes.
4143 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4144                                       Register limit, Register result, Register chr,
4145                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
4146   ShortBranchVerifier sbv(this);
4147   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4148 
4149   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4150   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4151 
4152   if (is_array_equ) {
4153     // Check the input args
4154     cmpoop(ary1, ary2);
4155     jcc(Assembler::equal, TRUE_LABEL);
4156 
4157     // Need additional checks for arrays_equals.
4158     testptr(ary1, ary1);
4159     jcc(Assembler::zero, FALSE_LABEL);
4160     testptr(ary2, ary2);
4161     jcc(Assembler::zero, FALSE_LABEL);
4162 
4163     // Check the lengths
4164     movl(limit, Address(ary1, length_offset));
4165     cmpl(limit, Address(ary2, length_offset));
4166     jcc(Assembler::notEqual, FALSE_LABEL);
4167   }
4168 
4169   // count == 0
4170   testl(limit, limit);
4171   jcc(Assembler::zero, TRUE_LABEL);
4172 
4173   if (is_array_equ) {
4174     // Load array address
4175     lea(ary1, Address(ary1, base_offset));
4176     lea(ary2, Address(ary2, base_offset));
4177   }
4178 
4179   if (is_array_equ && is_char) {
4180     // arrays_equals when used for char[].
4181     shll(limit, 1);      // byte count != 0
4182   }
4183   movl(result, limit); // copy
4184 
4185   if (UseAVX >= 2) {
4186     // With AVX2, use 32-byte vector compare
4187     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4188 
4189     // Compare 32-byte vectors
4190     andl(result, 0x0000001f);  //   tail count (in bytes)
4191     andl(limit, 0xffffffe0);   // vector count (in bytes)
4192     jcc(Assembler::zero, COMPARE_TAIL);
4193 
4194     lea(ary1, Address(ary1, limit, Address::times_1));
4195     lea(ary2, Address(ary2, limit, Address::times_1));
4196     negptr(limit);
4197 
4198 #ifdef _LP64
4199     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4200       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4201 
4202       cmpl(limit, -64);
4203       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4204 
4205       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4206 
4207       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4208       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4209       kortestql(mask, mask);
4210       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4211       addptr(limit, 64);  // update since we already compared at this addr
4212       cmpl(limit, -64);
4213       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4214 
4215       // At this point we may still need to compare -limit+result bytes.
4216       // We could execute the next two instructions and just continue via the non-wide path:
4217       //  cmpl(limit, 0);
4218       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4219       // But since we stopped at the points ary{1,2}+limit which are
4220       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4221       // (|limit| <= 32 and result < 32),
4222       // we may just compare the last 64 bytes.
4223       //
4224       addptr(result, -64);   // it is safe, because we just came from this area
4225       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4226       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4227       kortestql(mask, mask);
4228       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4229 
4230       jmp(TRUE_LABEL);
4231 
4232       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4233 
4234     }//if (VM_Version::supports_avx512vlbw())
4235 #endif //_LP64
4236     bind(COMPARE_WIDE_VECTORS);
4237     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
4238     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4239     vpxor(vec1, vec2);
4240 
4241     vptest(vec1, vec1);
4242     jcc(Assembler::notZero, FALSE_LABEL);
4243     addptr(limit, 32);
4244     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4245 
4246     testl(result, result);
4247     jcc(Assembler::zero, TRUE_LABEL);
4248 
4249     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
4250     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4251     vpxor(vec1, vec2);
4252 
4253     vptest(vec1, vec1);
4254     jccb(Assembler::notZero, FALSE_LABEL);
4255     jmpb(TRUE_LABEL);
4256 
4257     bind(COMPARE_TAIL); // limit is zero
4258     movl(limit, result);
4259     // Fallthru to tail compare
4260   } else if (UseSSE42Intrinsics) {
4261     // With SSE4.2, use double quad vector compare
4262     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4263 
4264     // Compare 16-byte vectors
4265     andl(result, 0x0000000f);  //   tail count (in bytes)
4266     andl(limit, 0xfffffff0);   // vector count (in bytes)
4267     jcc(Assembler::zero, COMPARE_TAIL);
4268 
4269     lea(ary1, Address(ary1, limit, Address::times_1));
4270     lea(ary2, Address(ary2, limit, Address::times_1));
4271     negptr(limit);
4272 
4273     bind(COMPARE_WIDE_VECTORS);
4274     movdqu(vec1, Address(ary1, limit, Address::times_1));
4275     movdqu(vec2, Address(ary2, limit, Address::times_1));
4276     pxor(vec1, vec2);
4277 
4278     ptest(vec1, vec1);
4279     jcc(Assembler::notZero, FALSE_LABEL);
4280     addptr(limit, 16);
4281     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4282 
4283     testl(result, result);
4284     jcc(Assembler::zero, TRUE_LABEL);
4285 
4286     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4287     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4288     pxor(vec1, vec2);
4289 
4290     ptest(vec1, vec1);
4291     jccb(Assembler::notZero, FALSE_LABEL);
4292     jmpb(TRUE_LABEL);
4293 
4294     bind(COMPARE_TAIL); // limit is zero
4295     movl(limit, result);
4296     // Fallthru to tail compare
4297   }
4298 
4299   // Compare 4-byte vectors
4300   andl(limit, 0xfffffffc); // vector count (in bytes)
4301   jccb(Assembler::zero, COMPARE_CHAR);
4302 
4303   lea(ary1, Address(ary1, limit, Address::times_1));
4304   lea(ary2, Address(ary2, limit, Address::times_1));
4305   negptr(limit);
4306 
4307   bind(COMPARE_VECTORS);
4308   movl(chr, Address(ary1, limit, Address::times_1));
4309   cmpl(chr, Address(ary2, limit, Address::times_1));
4310   jccb(Assembler::notEqual, FALSE_LABEL);
4311   addptr(limit, 4);
4312   jcc(Assembler::notZero, COMPARE_VECTORS);
4313 
4314   // Compare trailing char (final 2 bytes), if any
4315   bind(COMPARE_CHAR);
4316   testl(result, 0x2);   // tail  char
4317   jccb(Assembler::zero, COMPARE_BYTE);
4318   load_unsigned_short(chr, Address(ary1, 0));
4319   load_unsigned_short(limit, Address(ary2, 0));
4320   cmpl(chr, limit);
4321   jccb(Assembler::notEqual, FALSE_LABEL);
4322 
4323   if (is_array_equ && is_char) {
4324     bind(COMPARE_BYTE);
4325   } else {
4326     lea(ary1, Address(ary1, 2));
4327     lea(ary2, Address(ary2, 2));
4328 
4329     bind(COMPARE_BYTE);
4330     testl(result, 0x1);   // tail  byte
4331     jccb(Assembler::zero, TRUE_LABEL);
4332     load_unsigned_byte(chr, Address(ary1, 0));
4333     load_unsigned_byte(limit, Address(ary2, 0));
4334     cmpl(chr, limit);
4335     jccb(Assembler::notEqual, FALSE_LABEL);
4336   }
4337   bind(TRUE_LABEL);
4338   movl(result, 1);   // return true
4339   jmpb(DONE);
4340 
4341   bind(FALSE_LABEL);
4342   xorl(result, result); // return false
4343 
4344   // That's it
4345   bind(DONE);
4346   if (UseAVX >= 2) {
4347     // clean upper bits of YMM registers
4348     vpxor(vec1, vec1);
4349     vpxor(vec2, vec2);
4350   }
4351 }
4352 
4353 #ifdef _LP64
4354 
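     // Out-of-line slow path for convertF2I below: spill the source value to the
     // stack, call the matching fixup stub (f2i/f2l/d2i/d2l) and pop the corrected
     // result into dst before jumping back to the stub's continuation.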
4355 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4356 #define __ masm.
4357   Register dst = stub.data<0>();
4358   XMMRegister src = stub.data<1>();
4359   address target = stub.data<2>();
4360   __ bind(stub.entry());
4361   __ subptr(rsp, 8);
4362   __ movdbl(Address(rsp), src);
4363   __ call(RuntimeAddress(target));
4364   __ pop(dst);
4365   __ jmp(stub.continuation());
4366 #undef __
4367 }
4368 
4369 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4370   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4371   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4372 
4373   address slowpath_target;
4374   if (dst_bt == T_INT) {
4375     if (src_bt == T_FLOAT) {
4376       cvttss2sil(dst, src);
4377       cmpl(dst, 0x80000000);
4378       slowpath_target = StubRoutines::x86::f2i_fixup();
4379     } else {
4380       cvttsd2sil(dst, src);
4381       cmpl(dst, 0x80000000);
4382       slowpath_target = StubRoutines::x86::d2i_fixup();
4383     }
4384   } else {
4385     if (src_bt == T_FLOAT) {
4386       cvttss2siq(dst, src);
4387       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4388       slowpath_target = StubRoutines::x86::f2l_fixup();
4389     } else {
4390       cvttsd2siq(dst, src);
4391       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4392       slowpath_target = StubRoutines::x86::d2l_fixup();
4393     }
4394   }
4395 
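       // Note: the cvttss2si/cvttsd2si family returns the x86 "integer indefinite"
       // value (0x80000000 for int, 0x8000000000000000 for long) for NaN and
       // out-of-range inputs, so only a result equal to that value is routed to the
       // out-of-line fixup stub below.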
4396   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4397   jcc(Assembler::equal, stub->entry());
4398   bind(stub->continuation());
4399 }
4400 
4401 #endif // _LP64
4402 
4403 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4404                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4405   switch(ideal_opc) {
4406     case Op_LShiftVS:
4407       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4408     case Op_LShiftVI:
4409       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4410     case Op_LShiftVL:
4411       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4412     case Op_RShiftVS:
4413       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4414     case Op_RShiftVI:
4415       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4416     case Op_RShiftVL:
4417       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4418     case Op_URShiftVS:
4419       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4420     case Op_URShiftVI:
4421       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4422     case Op_URShiftVL:
4423       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4424     case Op_RotateRightV:
4425       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4426     case Op_RotateLeftV:
4427       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4428     default:
4429       fatal("Unsupported masked operation"); break;
4430   }
4431 }
4432 
4433 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4434                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4435                                     bool is_varshift) {
4436   switch (ideal_opc) {
4437     case Op_AddVB:
4438       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4439     case Op_AddVS:
4440       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4441     case Op_AddVI:
4442       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4443     case Op_AddVL:
4444       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4445     case Op_AddVF:
4446       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4447     case Op_AddVD:
4448       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4449     case Op_SubVB:
4450       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4451     case Op_SubVS:
4452       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4453     case Op_SubVI:
4454       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4455     case Op_SubVL:
4456       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4457     case Op_SubVF:
4458       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4459     case Op_SubVD:
4460       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4461     case Op_MulVS:
4462       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4463     case Op_MulVI:
4464       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4465     case Op_MulVL:
4466       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4467     case Op_MulVF:
4468       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4469     case Op_MulVD:
4470       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4471     case Op_DivVF:
4472       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4473     case Op_DivVD:
4474       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4475     case Op_SqrtVF:
4476       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4477     case Op_SqrtVD:
4478       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4479     case Op_AbsVB:
4480       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4481     case Op_AbsVS:
4482       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4483     case Op_AbsVI:
4484       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4485     case Op_AbsVL:
4486       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4487     case Op_FmaVF:
4488       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4489     case Op_FmaVD:
4490       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4491     case Op_VectorRearrange:
4492       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4493     case Op_LShiftVS:
4494       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4495     case Op_LShiftVI:
4496       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4497     case Op_LShiftVL:
4498       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4499     case Op_RShiftVS:
4500       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4501     case Op_RShiftVI:
4502       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4503     case Op_RShiftVL:
4504       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4505     case Op_URShiftVS:
4506       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4507     case Op_URShiftVI:
4508       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4509     case Op_URShiftVL:
4510       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4511     case Op_RotateLeftV:
4512       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4513     case Op_RotateRightV:
4514       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4515     case Op_MaxV:
4516       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4517     case Op_MinV:
4518       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4519     case Op_XorV:
4520       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4521     case Op_OrV:
4522       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4523     case Op_AndV:
4524       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4525     default:
4526       fatal("Unsupported masked operation"); break;
4527   }
4528 }
4529 
4530 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4531                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4532   switch (ideal_opc) {
4533     case Op_AddVB:
4534       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4535     case Op_AddVS:
4536       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4537     case Op_AddVI:
4538       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4539     case Op_AddVL:
4540       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4541     case Op_AddVF:
4542       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4543     case Op_AddVD:
4544       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4545     case Op_SubVB:
4546       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4547     case Op_SubVS:
4548       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4549     case Op_SubVI:
4550       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4551     case Op_SubVL:
4552       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4553     case Op_SubVF:
4554       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4555     case Op_SubVD:
4556       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4557     case Op_MulVS:
4558       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4559     case Op_MulVI:
4560       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4561     case Op_MulVL:
4562       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4563     case Op_MulVF:
4564       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4565     case Op_MulVD:
4566       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4567     case Op_DivVF:
4568       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4569     case Op_DivVD:
4570       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4571     case Op_FmaVF:
4572       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4573     case Op_FmaVD:
4574       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4575     case Op_MaxV:
4576       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4577     case Op_MinV:
4578       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4579     case Op_XorV:
4580       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4581     case Op_OrV:
4582       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4583     case Op_AndV:
4584       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4585     default:
4586       fatal("Unsupported masked operation"); break;
4587   }
4588 }
4589 
4590 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4591                                   KRegister src1, KRegister src2) {
4592   BasicType etype = T_ILLEGAL;
4593   switch(mask_len) {
4594     case 2:
4595     case 4:
4596     case 8:  etype = T_BYTE; break;
4597     case 16: etype = T_SHORT; break;
4598     case 32: etype = T_INT; break;
4599     case 64: etype = T_LONG; break;
4600     default: fatal("Unsupported type"); break;
4601   }
4602   assert(etype != T_ILLEGAL, "");
4603   switch(ideal_opc) {
4604     case Op_AndVMask:
4605       kand(etype, dst, src1, src2); break;
4606     case Op_OrVMask:
4607       kor(etype, dst, src1, src2); break;
4608     case Op_XorVMask:
4609       kxor(etype, dst, src1, src2); break;
4610     default:
4611       fatal("Unsupported masked operation"); break;
4612   }
4613 }
4614 
4615 /*
4616  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4617  * If src is NaN, the result is 0.
4618  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4619  * the result is equal to the value of Integer.MIN_VALUE.
4620  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4621  * the result is equal to the value of Integer.MAX_VALUE.
4622  */
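     // Illustrative (not emitted) Java-level equivalents of the rules above, assuming
     // the standard Java narrowing-conversion semantics:
     //   (int) Float.NaN               == 0
     //   (int) Float.NEGATIVE_INFINITY == Integer.MIN_VALUE
     //   (int) Float.POSITIVE_INFINITY == Integer.MAX_VALUE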
4623 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4624                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4625                                                                    Register rscratch, AddressLiteral float_sign_flip,
4626                                                                    int vec_enc) {
4627   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4628   Label done;
4629   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4630   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4631   vptest(xtmp2, xtmp2, vec_enc);
4632   jccb(Assembler::equal, done);
4633 
4634   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4635   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4636 
4637   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4638   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4639   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4640 
4641   // Recompute the mask for the remaining special values.
4642   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4643   // Extract SRC values corresponding to TRUE mask lanes.
4644   vpand(xtmp4, xtmp2, src, vec_enc);
4645   // Flip mask bits so that the MSB of mask lanes corresponding to +ve special
4646   // values is set.
4647   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4648 
4649   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4650   bind(done);
4651 }
4652 
4653 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4654                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4655                                                                     Register rscratch, AddressLiteral float_sign_flip,
4656                                                                     int vec_enc) {
4657   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4658   Label done;
4659   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4660   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4661   kortestwl(ktmp1, ktmp1);
4662   jccb(Assembler::equal, done);
4663 
4664   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4665   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4666   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4667 
4668   kxorwl(ktmp1, ktmp1, ktmp2);
4669   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4670   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4671   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4672   bind(done);
4673 }
4674 
4675 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4676                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4677                                                                      Register rscratch, AddressLiteral double_sign_flip,
4678                                                                      int vec_enc) {
4679   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4680 
4681   Label done;
4682   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4683   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4684   kortestwl(ktmp1, ktmp1);
4685   jccb(Assembler::equal, done);
4686 
4687   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4688   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4689   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4690 
4691   kxorwl(ktmp1, ktmp1, ktmp2);
4692   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4693   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4694   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4695   bind(done);
4696 }
4697 
4698 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4699                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4700                                                                      Register rscratch, AddressLiteral float_sign_flip,
4701                                                                      int vec_enc) {
4702   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4703   Label done;
4704   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4705   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4706   kortestwl(ktmp1, ktmp1);
4707   jccb(Assembler::equal, done);
4708 
4709   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4710   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4711   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4712 
4713   kxorwl(ktmp1, ktmp1, ktmp2);
4714   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4715   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4716   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4717   bind(done);
4718 }
4719 
4720 /*
4721  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4722  * If src is NaN, the result is 0.
4723  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4724  * the result is equal to the value of Long.MIN_VALUE.
4725  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4726  * the result is equal to the value of Long.MAX_VALUE.
4727  */
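     // Illustrative (not emitted) Java-level equivalents of the rules above:
     //   (long) Double.NaN               == 0L
     //   (long) Double.NEGATIVE_INFINITY == Long.MIN_VALUE
     //   (long) Double.POSITIVE_INFINITY == Long.MAX_VALUE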
4728 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4729                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4730                                                                       Register rscratch, AddressLiteral double_sign_flip,
4731                                                                       int vec_enc) {
4732   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4733 
4734   Label done;
4735   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4736   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4737   kortestwl(ktmp1, ktmp1);
4738   jccb(Assembler::equal, done);
4739 
4740   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4741   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4742   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4743 
4744   kxorwl(ktmp1, ktmp1, ktmp2);
4745   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4746   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4747   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4748   bind(done);
4749 }
4750 
4751 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4752                                                              XMMRegister xtmp, int index, int vec_enc) {
4753    assert(vec_enc < Assembler::AVX_512bit, "");
4754    if (vec_enc == Assembler::AVX_256bit) {
4755      vextractf128_high(xtmp, src);
4756      vshufps(dst, src, xtmp, index, vec_enc);
4757    } else {
4758      vshufps(dst, src, zero, index, vec_enc);
4759    }
4760 }
4761 
4762 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4763                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
4764                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
4765   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4766 
4767   Label done;
4768   // Compare the destination lanes with float_sign_flip
4769   // value to get mask for all special values.
4770   movdqu(xtmp1, float_sign_flip, rscratch);
4771   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
4772   ptest(xtmp2, xtmp2);
4773   jccb(Assembler::equal, done);
4774 
4775   // Flip float_sign_flip to get max integer value.
4776   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
4777   pxor(xtmp1, xtmp4);
4778 
4779   // Set destination lanes corresponding to unordered source lanes to zero.
4780   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
4781   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
4782 
4783   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
4784   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4785   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
4786 
4787   // Recompute the mask for remaining special value.
4788   pxor(xtmp2, xtmp3);
4789   // Extract mask corresponding to non-negative source lanes.
4790   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
4791 
4792   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
4793   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4794   pand(xtmp3, xtmp2);
4795 
4796   // Replace destination lanes holding the special value (0x80000000) with max int
4797   // if the corresponding source lane holds a +ve value.
4798   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
4799   bind(done);
4800 }
4801 
4802 
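     // Narrows packed ints to shorts or bytes: each int lane is first masked down to
     // the subword value and then packed with unsigned saturation (vpackusdw, plus
     // vpackuswb for bytes). Since the packs work within 128-bit lanes, the
     // AVX_256bit case also uses the cross-lane doubleword pack helper above to
     // gather the per-lane results.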
4803 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
4804                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
4805   switch(to_elem_bt) {
4806     case T_SHORT:
4807       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
4808       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
4809       vpackusdw(dst, dst, zero, vec_enc);
4810       if (vec_enc == Assembler::AVX_256bit) {
4811         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4812       }
4813       break;
4814     case  T_BYTE:
4815       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
4816       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
4817       vpackusdw(dst, dst, zero, vec_enc);
4818       if (vec_enc == Assembler::AVX_256bit) {
4819         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4820       }
4821       vpackuswb(dst, dst, zero, vec_enc);
4822       break;
4823     default: assert(false, "%s", type2name(to_elem_bt));
4824   }
4825 }
4826 
4827 /*
4828  * Algorithm for vector D2L and F2I conversions:
4829  * a) Perform vector D2L/F2I cast.
4830  * b) Choose the fast path if none of the result vector lanes contains the 0x80000000 value.
4831  *    A lane holding 0x80000000 signifies that the source value could be one of the special
4832  *    floating point values (NaN, -Inf, Inf, Max, -Min).
4833  * c) Set the destination lane to zero if the source lane is NaN.
4834  * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
4835  */
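     // Rough per-lane sketch of steps b)-d) above (illustrative pseudocode only):
     //   if (dst_lane == 0x80000000) {      // lane came from a special source value
     //     if (isNaN(src_lane))       dst_lane = 0;
     //     else if (src_lane > 0)     dst_lane = Integer.MAX_VALUE;
     //     // otherwise keep 0x80000000 == Integer.MIN_VALUE
     //   }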
4836 
4837 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4838                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4839                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
4840   int to_elem_sz = type2aelembytes(to_elem_bt);
4841   assert(to_elem_sz <= 4, "");
4842   vcvttps2dq(dst, src, vec_enc);
4843   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
4844   if (to_elem_sz < 4) {
4845     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4846     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
4847   }
4848 }
4849 
4850 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4851                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
4852                                             Register rscratch, int vec_enc) {
4853   int to_elem_sz = type2aelembytes(to_elem_bt);
4854   assert(to_elem_sz <= 4, "");
4855   vcvttps2dq(dst, src, vec_enc);
4856   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
4857   switch(to_elem_bt) {
4858     case T_INT:
4859       break;
4860     case T_SHORT:
4861       evpmovdw(dst, dst, vec_enc);
4862       break;
4863     case T_BYTE:
4864       evpmovdb(dst, dst, vec_enc);
4865       break;
4866     default: assert(false, "%s", type2name(to_elem_bt));
4867   }
4868 }
4869 
4870 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4871                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
4872                                             Register rscratch, int vec_enc) {
4873   evcvttps2qq(dst, src, vec_enc);
4874   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
4875 }
4876 
4877 // Handling for downcasting from double to integer or sub-word types on AVX2.
4878 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4879                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
4880                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
4881   int to_elem_sz = type2aelembytes(to_elem_bt);
4882   assert(to_elem_sz < 8, "");
4883   vcvttpd2dq(dst, src, vec_enc);
4884   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
4885                                               float_sign_flip, vec_enc);
4886   if (to_elem_sz < 4) {
4887     // xtmp4 holds all zero lanes.
4888     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
4889   }
4890 }
4891 
4892 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
4893                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
4894                                             KRegister ktmp2, AddressLiteral sign_flip,
4895                                             Register rscratch, int vec_enc) {
4896   if (VM_Version::supports_avx512dq()) {
4897     evcvttpd2qq(dst, src, vec_enc);
4898     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
4899     switch(to_elem_bt) {
4900       case T_LONG:
4901         break;
4902       case T_INT:
4903         evpmovsqd(dst, dst, vec_enc);
4904         break;
4905       case T_SHORT:
4906         evpmovsqd(dst, dst, vec_enc);
4907         evpmovdw(dst, dst, vec_enc);
4908         break;
4909       case T_BYTE:
4910         evpmovsqd(dst, dst, vec_enc);
4911         evpmovdb(dst, dst, vec_enc);
4912         break;
4913       default: assert(false, "%s", type2name(to_elem_bt));
4914     }
4915   } else {
4916     assert(type2aelembytes(to_elem_bt) <= 4, "");
4917     vcvttpd2dq(dst, src, vec_enc);
4918     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
4919     switch(to_elem_bt) {
4920       case T_INT:
4921         break;
4922       case T_SHORT:
4923         evpmovdw(dst, dst, vec_enc);
4924         break;
4925       case T_BYTE:
4926         evpmovdb(dst, dst, vec_enc);
4927         break;
4928       default: assert(false, "%s", type2name(to_elem_bt));
4929     }
4930   }
4931 }
4932 
4933 #ifdef _LP64
4934 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
4935                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4936                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
4937   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
4938   // and restore the original MXCSR.RC mode after that.
4939   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4940 
4941   mov64(tmp, julong_cast(0.5L));
4942   evpbroadcastq(xtmp1, tmp, vec_enc);
4943   vaddpd(xtmp1, src , xtmp1, vec_enc);
4944   evcvtpd2qq(dst, xtmp1, vec_enc);
4945   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
4946                                                 double_sign_flip, vec_enc);
4947 
4948   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4949 }
4950 
4951 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
4952                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4953                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
4954   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
4955   // and restore the original MXCSR.RC mode after that.
4956   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4957 
4958   movl(tmp, jint_cast(0.5));
4959   movq(xtmp1, tmp);
4960   vbroadcastss(xtmp1, xtmp1, vec_enc);
4961   vaddps(xtmp1, src , xtmp1, vec_enc);
4962   vcvtps2dq(dst, xtmp1, vec_enc);
4963   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
4964                                               float_sign_flip, vec_enc);
4965 
4966   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4967 }
4968 
4969 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
4970                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4971                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
4972   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
4973   // and restore the original MXCSR.RC mode after that.
4974   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4975 
4976   movl(tmp, jint_cast(0.5));
4977   movq(xtmp1, tmp);
4978   vbroadcastss(xtmp1, xtmp1, vec_enc);
4979   vaddps(xtmp1, src , xtmp1, vec_enc);
4980   vcvtps2dq(dst, xtmp1, vec_enc);
4981   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
4982 
4983   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4984 }
4985 #endif // _LP64
4986 
4987 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
4988                                              BasicType from_elem_bt, BasicType to_elem_bt) {
4989   switch (from_elem_bt) {
4990     case T_BYTE:
4991       switch (to_elem_bt) {
4992         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
4993         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
4994         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
4995         default: ShouldNotReachHere();
4996       }
4997       break;
4998     case T_SHORT:
4999       switch (to_elem_bt) {
5000         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5001         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5002         default: ShouldNotReachHere();
5003       }
5004       break;
5005     case T_INT:
5006       assert(to_elem_bt == T_LONG, "");
5007       vpmovzxdq(dst, src, vlen_enc);
5008       break;
5009     default:
5010       ShouldNotReachHere();
5011   }
5012 }
5013 
5014 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5015                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5016   switch (from_elem_bt) {
5017     case T_BYTE:
5018       switch (to_elem_bt) {
5019         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5020         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5021         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5022         default: ShouldNotReachHere();
5023       }
5024       break;
5025     case T_SHORT:
5026       switch (to_elem_bt) {
5027         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5028         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5029         default: ShouldNotReachHere();
5030       }
5031       break;
5032     case T_INT:
5033       assert(to_elem_bt == T_LONG, "");
5034       vpmovsxdq(dst, src, vlen_enc);
5035       break;
5036     default:
5037       ShouldNotReachHere();
5038   }
5039 }
5040 
5041 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5042                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5043   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5044   assert(vlen_enc != AVX_512bit, "");
5045 
5046   int dst_bt_size = type2aelembytes(dst_bt);
5047   int src_bt_size = type2aelembytes(src_bt);
5048   if (dst_bt_size > src_bt_size) {
5049     switch (dst_bt_size / src_bt_size) {
5050       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5051       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5052       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5053       default: ShouldNotReachHere();
5054     }
5055   } else {
5056     assert(dst_bt_size < src_bt_size, "");
5057     switch (src_bt_size / dst_bt_size) {
5058       case 2: {
5059         if (vlen_enc == AVX_128bit) {
5060           vpacksswb(dst, src, src, vlen_enc);
5061         } else {
5062           vpacksswb(dst, src, src, vlen_enc);
5063           vpermq(dst, dst, 0x08, vlen_enc);
5064         }
5065         break;
5066       }
5067       case 4: {
5068         if (vlen_enc == AVX_128bit) {
5069           vpackssdw(dst, src, src, vlen_enc);
5070           vpacksswb(dst, dst, dst, vlen_enc);
5071         } else {
5072           vpackssdw(dst, src, src, vlen_enc);
5073           vpermq(dst, dst, 0x08, vlen_enc);
5074           vpacksswb(dst, dst, dst, AVX_128bit);
5075         }
5076         break;
5077       }
5078       case 8: {
5079         if (vlen_enc == AVX_128bit) {
5080           vpshufd(dst, src, 0x08, vlen_enc);
5081           vpackssdw(dst, dst, dst, vlen_enc);
5082           vpacksswb(dst, dst, dst, vlen_enc);
5083         } else {
5084           vpshufd(dst, src, 0x08, vlen_enc);
5085           vpermq(dst, dst, 0x08, vlen_enc);
5086           vpackssdw(dst, dst, dst, AVX_128bit);
5087           vpacksswb(dst, dst, dst, AVX_128bit);
5088         }
5089         break;
5090       }
5091       default: ShouldNotReachHere();
5092     }
5093   }
5094 }
5095 
5096 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5097                                    bool merge, BasicType bt, int vlen_enc) {
5098   if (bt == T_INT) {
5099     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5100   } else {
5101     assert(bt == T_LONG, "");
5102     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5103   }
5104 }
5105 
5106 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5107                                    bool merge, BasicType bt, int vlen_enc) {
5108   if (bt == T_INT) {
5109     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5110   } else {
5111     assert(bt == T_LONG, "");
5112     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5113   }
5114 }
5115 
5116 #ifdef _LP64
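     // Converts the low mask_len bits of 'src' into a vector of 0/1 bytes, one byte
     // per mask bit: pdepq with the 0x0101010101010101 selector deposits eight mask
     // bits into bit 0 of eight consecutive bytes per iteration; subsequent 8-bit
     // groups are shifted down from rtmp2 and inserted into dst via pinsrq/vinsertf128.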
5117 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5118                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5119                                                int vec_enc) {
5120   int index = 0;
5121   int vindex = 0;
5122   mov64(rtmp1, 0x0101010101010101L);
5123   pdepq(rtmp1, src, rtmp1);
5124   if (mask_len > 8) {
5125     movq(rtmp2, src);
5126     vpxor(xtmp, xtmp, xtmp, vec_enc);
5127     movq(xtmp, rtmp1);
5128   }
5129   movq(dst, rtmp1);
5130 
5131   mask_len -= 8;
5132   while (mask_len > 0) {
5133     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5134     index++;
5135     if ((index % 2) == 0) {
5136       pxor(xtmp, xtmp);
5137     }
5138     mov64(rtmp1, 0x0101010101010101L);
5139     shrq(rtmp2, 8);
5140     pdepq(rtmp1, rtmp2, rtmp1);
5141     pinsrq(xtmp, rtmp1, index % 2);
5142     vindex = index / 2;
5143     if (vindex) {
5144       // Write entire 16 byte vector when both 64 bit
5145       // lanes are updated to save redundant instructions.
5146       if (index % 2) {
5147         vinsertf128(dst, dst, xtmp, vindex);
5148       }
5149     } else {
5150       vmovdqu(dst, xtmp);
5151     }
5152     mask_len -= 8;
5153   }
5154 }
5155 
5156 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5157   switch(opc) {
5158     case Op_VectorMaskTrueCount:
5159       popcntq(dst, tmp);
5160       break;
5161     case Op_VectorMaskLastTrue:
5162       if (VM_Version::supports_lzcnt()) {
5163         lzcntq(tmp, tmp);
5164         movl(dst, 63);
5165         subl(dst, tmp);
5166       } else {
5167         movl(dst, -1);
5168         bsrq(tmp, tmp);
5169         cmov32(Assembler::notZero, dst, tmp);
5170       }
5171       break;
5172     case Op_VectorMaskFirstTrue:
5173       if (VM_Version::supports_bmi1()) {
5174         if (masklen < 32) {
5175           orl(tmp, 1 << masklen);
5176           tzcntl(dst, tmp);
5177         } else if (masklen == 32) {
5178           tzcntl(dst, tmp);
5179         } else {
5180           assert(masklen == 64, "");
5181           tzcntq(dst, tmp);
5182         }
5183       } else {
5184         if (masklen < 32) {
5185           orl(tmp, 1 << masklen);
5186           bsfl(dst, tmp);
5187         } else {
5188           assert(masklen == 32 || masklen == 64, "");
5189           movl(dst, masklen);
5190           if (masklen == 32)  {
5191             bsfl(tmp, tmp);
5192           } else {
5193             bsfq(tmp, tmp);
5194           }
5195           cmov32(Assembler::notZero, dst, tmp);
5196         }
5197       }
5198       break;
5199     case Op_VectorMaskToLong:
5200       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5201       break;
5202     default: assert(false, "Unhandled mask operation");
5203   }
5204 }
5205 
5206 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5207                                               int masklen, int masksize, int vec_enc) {
5208   assert(VM_Version::supports_popcnt(), "");
5209 
5210   if (VM_Version::supports_avx512bw()) {
5211     kmovql(tmp, mask);
5212   } else {
5213     assert(masklen <= 16, "");
5214     kmovwl(tmp, mask);
5215   }
5216 
5217   // Mask generated out of partial vector comparisons/replicate/mask manipulation
5218   // operations needs to be clipped.
5219   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5220     andq(tmp, (1 << masklen) - 1);
5221   }
5222 
5223   vector_mask_operation_helper(opc, dst, tmp, masklen);
5224 }
5225 
5226 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5227                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5228   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5229          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5230   assert(VM_Version::supports_popcnt(), "");
5231 
5232   bool need_clip = false;
5233   switch(bt) {
5234     case T_BOOLEAN:
5235       // Masks of other types contain lane values 0 and -1, while boolean masks contain 0 and 1
5236       vpxor(xtmp, xtmp, xtmp, vec_enc);
5237       vpsubb(xtmp, xtmp, mask, vec_enc);
5238       vpmovmskb(tmp, xtmp, vec_enc);
5239       need_clip = masklen < 16;
5240       break;
5241     case T_BYTE:
5242       vpmovmskb(tmp, mask, vec_enc);
5243       need_clip = masklen < 16;
5244       break;
5245     case T_SHORT:
5246       vpacksswb(xtmp, mask, mask, vec_enc);
5247       if (masklen >= 16) {
5248         vpermpd(xtmp, xtmp, 8, vec_enc);
5249       }
5250       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5251       need_clip = masklen < 16;
5252       break;
5253     case T_INT:
5254     case T_FLOAT:
5255       vmovmskps(tmp, mask, vec_enc);
5256       need_clip = masklen < 4;
5257       break;
5258     case T_LONG:
5259     case T_DOUBLE:
5260       vmovmskpd(tmp, mask, vec_enc);
5261       need_clip = masklen < 2;
5262       break;
5263     default: assert(false, "Unhandled type, %s", type2name(bt));
5264   }
5265 
5266   // Mask generated out of partial vector comparisons/replicate/mask manipulation
5267   // operations needs to be clipped.
5268   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5269     // need_clip implies masklen < 32
5270     andq(tmp, (1 << masklen) - 1);
5271   }
5272 
5273   vector_mask_operation_helper(opc, dst, tmp, masklen);
5274 }
5275 
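     // Compresses a mask with n set bits into a mask with its n lowest bits set: the
     // active mask_len bits are copied to a GPR and pextq gathers one bit from an
     // all-ones source for every set bit, leaving popcount(mask) contiguous low bits.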
5276 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5277                                              Register rtmp2, int mask_len) {
5278   kmov(rtmp1, src);
5279   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5280   mov64(rtmp2, -1L);
5281   pextq(rtmp2, rtmp2, rtmp1);
5282   kmov(dst, rtmp2);
5283 }
5284 
5285 #ifdef _LP64
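     // AVX2 fallback for CompressV/ExpandV on 4- and 8-byte lanes: the movmsk bits of
     // 'mask' select a 32-byte row from a stub-generated permute table
     // (row offset = mask_bits * 32), vpermps applies that row to 'src', and lanes
     // whose table entry is -1 are blended with zero.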
5286 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5287                                                     XMMRegister mask, Register rtmp, Register rscratch,
5288                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5289                                                     int vec_enc) {
5290   assert(type2aelembytes(bt) >= 4, "");
5291   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5292   address compress_perm_table = nullptr;
5293   address expand_perm_table = nullptr;
5294   if (type2aelembytes(bt) == 8) {
5295     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5296     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5297     vmovmskpd(rtmp, mask, vec_enc);
5298   } else {
5299     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5300     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5301     vmovmskps(rtmp, mask, vec_enc);
5302   }
5303   shlq(rtmp, 5); // for 32 byte permute row.
5304   if (opcode == Op_CompressV) {
5305     lea(rscratch, ExternalAddress(compress_perm_table));
5306   } else {
5307     lea(rscratch, ExternalAddress(expand_perm_table));
5308   }
5309   addptr(rtmp, rscratch);
5310   vmovdqu(permv, Address(rtmp));
5311   vpermps(dst, permv, src, Assembler::AVX_256bit);
5312   vpxor(xtmp, xtmp, xtmp, vec_enc);
5313   // Blend the result with a zero vector using the permute mask: each column entry
5314   // in a permute table row contains either a valid permute index or a -1 (default)
5315   // value, so the row can also serve as a blending mask after
5316   // compressing/expanding the source vector lanes.
5317   vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
5318 }
5319 #endif
5320 
5321 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5322                                                bool merge, BasicType bt, int vec_enc) {
5323   if (opcode == Op_CompressV) {
5324     switch(bt) {
5325     case T_BYTE:
5326       evpcompressb(dst, mask, src, merge, vec_enc);
5327       break;
5328     case T_CHAR:
5329     case T_SHORT:
5330       evpcompressw(dst, mask, src, merge, vec_enc);
5331       break;
5332     case T_INT:
5333       evpcompressd(dst, mask, src, merge, vec_enc);
5334       break;
5335     case T_FLOAT:
5336       evcompressps(dst, mask, src, merge, vec_enc);
5337       break;
5338     case T_LONG:
5339       evpcompressq(dst, mask, src, merge, vec_enc);
5340       break;
5341     case T_DOUBLE:
5342       evcompresspd(dst, mask, src, merge, vec_enc);
5343       break;
5344     default:
5345       fatal("Unsupported type %s", type2name(bt));
5346       break;
5347     }
5348   } else {
5349     assert(opcode == Op_ExpandV, "");
5350     switch(bt) {
5351     case T_BYTE:
5352       evpexpandb(dst, mask, src, merge, vec_enc);
5353       break;
5354     case T_CHAR:
5355     case T_SHORT:
5356       evpexpandw(dst, mask, src, merge, vec_enc);
5357       break;
5358     case T_INT:
5359       evpexpandd(dst, mask, src, merge, vec_enc);
5360       break;
5361     case T_FLOAT:
5362       evexpandps(dst, mask, src, merge, vec_enc);
5363       break;
5364     case T_LONG:
5365       evpexpandq(dst, mask, src, merge, vec_enc);
5366       break;
5367     case T_DOUBLE:
5368       evexpandpd(dst, mask, src, merge, vec_enc);
5369       break;
5370     default:
5371       fatal("Unsupported type %s", type2name(bt));
5372       break;
5373     }
5374   }
5375 }
5376 #endif
5377 
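     // Vector signum, i.e. lane-wise Math.signum semantics: produce -1.0 for negative
     // lanes and +1.0 for positive lanes, while NaN, -0.0 and +0.0 lanes return the
     // source value unchanged.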
5378 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5379                                            KRegister ktmp1, int vec_enc) {
5380   if (opcode == Op_SignumVD) {
5381     vsubpd(dst, zero, one, vec_enc);
5382     // if src < 0 ? -1 : 1
5383     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5384     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5385     // if src == NaN, -0.0 or 0.0 return src.
5386     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5387     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5388   } else {
5389     assert(opcode == Op_SignumVF, "");
5390     vsubps(dst, zero, one, vec_enc);
5391     // if src < 0 ? -1 : 1
5392     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5393     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5394     // if src == NaN, -0.0 or 0.0 return src.
5395     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5396     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5397   }
5398 }
5399 
5400 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5401                                           XMMRegister xtmp1, int vec_enc) {
5402   if (opcode == Op_SignumVD) {
5403     vsubpd(dst, zero, one, vec_enc);
5404     // if src < 0 ? -1 : 1
5405     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
5406     // if src == NaN, -0.0 or 0.0 return src.
5407     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5408     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5409   } else {
5410     assert(opcode == Op_SignumVF, "");
5411     vsubps(dst, zero, one, vec_enc);
5412     // if src < 0 ? -1 : 1
5413     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
5414     // if src is NaN, -0.0 or 0.0, return src.
5415     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5416     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5417   }
5418 }
5419 
5420 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5421   if (VM_Version::supports_avx512bw()) {
5422     if (mask_len > 32) {
5423       kmovql(dst, src);
5424     } else {
5425       kmovdl(dst, src);
5426       if (mask_len != 32) {
5427         kshiftrdl(dst, dst, 32 - mask_len);
5428       }
5429     }
5430   } else {
5431     assert(mask_len <= 16, "");
5432     kmovwl(dst, src);
5433     if (mask_len != 16) {
5434       kshiftrwl(dst, dst, 16 - mask_len);
5435     }
5436   }
5437 }
5438 
5439 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5440   int lane_size = type2aelembytes(bt);
5441   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5442   if ((is_LP64 || lane_size < 8) &&
5443       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5444        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5445     movptr(rtmp, imm32);
5446     switch(lane_size) {
5447       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5448       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5449       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5450       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5451       default : fatal("Unsupported lane size %d", lane_size);
5452       break;
5453     }
5454   } else {
5455     movptr(rtmp, imm32);
5456     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5457     switch(lane_size) {
5458       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5459       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5460       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5461       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5462       default : fatal("Unsupported lane size %d", lane_size);
5463       break;
5464     }
5465   }
5466 }
5467 
5468 //
5469 // The following is a lookup table based popcount computation algorithm:
5470 //       Index   Bit set count
5471 //     [ 0000 ->   0,
5472 //       0001 ->   1,
5473 //       0010 ->   1,
5474 //       0011 ->   2,
5475 //       0100 ->   1,
5476 //       0101 ->   2,
5477 //       0110 ->   2,
5478 //       0111 ->   3,
5479 //       1000 ->   1,
5480 //       1001 ->   2,
5481 //       1010 ->   2,
5482 //       1011 ->   3,
5483 //       1100 ->   2,
5484 //       1101 ->   3,
     //       1110 ->   3,
5485 //       1111 ->   4 ]
5486 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5487 //     shuffle indices for lookup table access.
5488 //  b. Right shift each byte of vector lane by 4 positions.
5489 //  c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as
5490 //     shuffle indices for lookup table access.
5491 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5492 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5493 //     count of all the bytes of a quadword.
5494 //  f. Perform step e. for upper 128bit vector lane.
5495 //  g. Pack the bitset count of quadwords back to double word.
5496 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
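     //  Illustrative scalar form of steps a-d (sketch only, not used by the code below;
     //  helper name is hypothetical):
     //    static const uint8_t bitcount_lut[16] = {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4};
     //    int popcount_byte(uint8_t b) { return bitcount_lut[b & 0x0F] + bitcount_lut[b >> 4]; }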
5497 
5498 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5499                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5500   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5501   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5502   vpsrlw(dst, src, 4, vec_enc);
5503   vpand(dst, dst, xtmp1, vec_enc);
5504   vpand(xtmp1, src, xtmp1, vec_enc);
5505   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5506   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5507   vpshufb(dst, xtmp2, dst, vec_enc);
5508   vpaddb(dst, dst, xtmp1, vec_enc);
5509 }
5510 
5511 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5512                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5513   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5514   // Following code is as per steps e,f,g and h of above algorithm.
5515   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5516   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5517   vpsadbw(dst, dst, xtmp2, vec_enc);
5518   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5519   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5520   vpackuswb(dst, xtmp1, dst, vec_enc);
5521 }
5522 
5523 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5524                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5525   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5526   // Add the popcount of upper and lower bytes of word.
5527   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5528   vpsrlw(dst, xtmp1, 8, vec_enc);
5529   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5530   vpaddw(dst, dst, xtmp1, vec_enc);
5531 }
5532 
5533 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5534                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5535   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5536   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5537   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5538 }
5539 
5540 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5541                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5542   switch(bt) {
5543     case T_LONG:
5544       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5545       break;
5546     case T_INT:
5547       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5548       break;
5549     case T_CHAR:
5550     case T_SHORT:
5551       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5552       break;
5553     case T_BYTE:
5554     case T_BOOLEAN:
5555       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5556       break;
5557     default:
5558       fatal("Unsupported type %s", type2name(bt));
5559       break;
5560   }
5561 }
5562 
5563 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5564                                                       KRegister mask, bool merge, int vec_enc) {
5565   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5566   switch(bt) {
5567     case T_LONG:
5568       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5569       evpopcntq(dst, mask, src, merge, vec_enc);
5570       break;
5571     case T_INT:
5572       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5573       evpopcntd(dst, mask, src, merge, vec_enc);
5574       break;
5575     case T_CHAR:
5576     case T_SHORT:
5577       assert(VM_Version::supports_avx512_bitalg(), "");
5578       evpopcntw(dst, mask, src, merge, vec_enc);
5579       break;
5580     case T_BYTE:
5581     case T_BOOLEAN:
5582       assert(VM_Version::supports_avx512_bitalg(), "");
5583       evpopcntb(dst, mask, src, merge, vec_enc);
5584       break;
5585     default:
5586       fatal("Unsupported type %s", type2name(bt));
5587       break;
5588   }
5589 }
5590 
5591 #ifndef _LP64
5592 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5593   assert(VM_Version::supports_avx512bw(), "");
5594   kmovdl(tmp, src);
5595   kunpckdql(dst, tmp, tmp);
5596 }
5597 #endif
5598 
5599 // The bit reversal algorithm first reverses the bits of each byte, followed by
5600 // a byte level reversal for multi-byte primitive types (short/int/long).
5601 // The algorithm performs a lookup table access to get the reverse bit sequence
5602 // corresponding to a 4 bit value. Thus the reverse bit sequence of a byte
5603 // is obtained by swapping the reversed bit sequences of its upper and lower
5604 // nibbles.
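     // Illustrative scalar form (sketch only, not used by the code below; helper name is
     // hypothetical):
     //   static const uint8_t rev_nibble[16] = {0x0,0x8,0x4,0xC,0x2,0xA,0x6,0xE,
     //                                          0x1,0x9,0x5,0xD,0x3,0xB,0x7,0xF};
     //   uint8_t reverse_byte(uint8_t b) {
     //     return (uint8_t)((rev_nibble[b & 0x0F] << 4) | rev_nibble[b >> 4]);
     //   }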
5605 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5606                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5607   if (VM_Version::supports_avx512vlbw()) {
5608 
5609     // Get the reverse bit sequence of lower nibble of each byte.
5610     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5611     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5612     evpandq(dst, xtmp2, src, vec_enc);
5613     vpshufb(dst, xtmp1, dst, vec_enc);
5614     vpsllq(dst, dst, 4, vec_enc);
5615 
5616     // Get the reverse bit sequence of upper nibble of each byte.
5617     vpandn(xtmp2, xtmp2, src, vec_enc);
5618     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5619     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5620 
5621     // Perform a logical OR between the left shifted reverse bit sequence of the lower nibble and
5622     // the right shifted reverse bit sequence of the upper nibble to obtain the reverse bit sequence of each byte.
5623     evporq(xtmp2, dst, xtmp2, vec_enc);
5624     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5625 
5626   } else if (vec_enc == Assembler::AVX_512bit) {
5627     // Shift based bit reversal.
5628     assert(bt == T_LONG || bt == T_INT, "");
5629 
5630     // Swap lower and upper nibble of each byte.
5631     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5632 
5633     // Swap two least and most significant bits of each nibble.
5634     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5635 
5636     // Swap adjacent pair of bits.
5637     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5638     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5639 
5640     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5641     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5642   } else {
5643     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5644     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5645 
5646     // Get the reverse bit sequence of lower nibble of each byte.
5647     vpand(dst, xtmp2, src, vec_enc);
5648     vpshufb(dst, xtmp1, dst, vec_enc);
5649     vpsllq(dst, dst, 4, vec_enc);
5650 
5651     // Get the reverse bit sequence of upper nibble of each byte.
5652     vpandn(xtmp2, xtmp2, src, vec_enc);
5653     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5654     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5655 
5656     // Perform a logical OR between the left shifted reverse bit sequence of the lower nibble and
5657     // the right shifted reverse bit sequence of the upper nibble to obtain the reverse bit sequence of each byte.
5658     vpor(xtmp2, dst, xtmp2, vec_enc);
5659     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5660   }
5661 }
5662 
5663 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5664                                                 XMMRegister xtmp, Register rscratch) {
5665   assert(VM_Version::supports_gfni(), "");
5666   assert(rscratch != noreg || always_reachable(mask), "missing");
5667 
5668   // Galois field instruction based bit reversal based on following algorithm.
5669   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5670   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5671   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5672   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5673 }
5674 
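     // Swaps the two nbits-wide halves selected by bitmask within each 2*nbits-wide group,
     // i.e. per lane this computes the scalar equivalent
     //   dst = ((src & bitmask) << nbits) | ((src & ~bitmask) >> nbits)
     // where bitmask marks the lower half of each group (e.g. bitmask 0x0F0F0F0F with
     // nbits == 4 swaps the nibbles of every byte).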
5675 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5676                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
5677   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5678   evpandq(dst, xtmp1, src, vec_enc);
5679   vpsllq(dst, dst, nbits, vec_enc);
5680   vpandn(xtmp1, xtmp1, src, vec_enc);
5681   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5682   evporq(dst, dst, xtmp1, vec_enc);
5683 }
5684 
5685 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5686                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5687   // Shift based bit reversal.
5688   assert(VM_Version::supports_evex(), "");
5689   switch(bt) {
5690     case T_LONG:
5691       // Swap upper and lower double word of each quad word.
5692       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5693       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5694       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5695       break;
5696     case T_INT:
5697       // Swap upper and lower word of each double word.
5698       evprord(xtmp1, k0, src, 16, true, vec_enc);
5699       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5700       break;
5701     case T_CHAR:
5702     case T_SHORT:
5703       // Swap upper and lower byte of each word.
5704       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5705       break;
5706     case T_BYTE:
5707       evmovdquq(dst, k0, src, true, vec_enc);
5708       break;
5709     default:
5710       fatal("Unsupported type %s", type2name(bt));
5711       break;
5712   }
5713 }
5714 
5715 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5716   if (bt == T_BYTE) {
5717     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
5718       evmovdquq(dst, k0, src, true, vec_enc);
5719     } else {
5720       vmovdqu(dst, src);
5721     }
5722     return;
5723   }
5724   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
5725   // pre-computed shuffle indices.
5726   switch(bt) {
5727     case T_LONG:
5728       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
5729       break;
5730     case T_INT:
5731       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
5732       break;
5733     case T_CHAR:
5734     case T_SHORT:
5735       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
5736       break;
5737     default:
5738       fatal("Unsupported type %s", type2name(bt));
5739       break;
5740   }
5741   vpshufb(dst, src, dst, vec_enc);
5742 }
5743 
5744 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5745                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5746                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
5747   assert(is_integral_type(bt), "");
5748   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5749   assert(VM_Version::supports_avx512cd(), "");
5750   switch(bt) {
5751     case T_LONG:
5752       evplzcntq(dst, ktmp, src, merge, vec_enc);
5753       break;
5754     case T_INT:
5755       evplzcntd(dst, ktmp, src, merge, vec_enc);
5756       break;
5757     case T_SHORT:
5758       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
5759       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
5760       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
5761       vpunpckhwd(dst, xtmp1, src, vec_enc);
5762       evplzcntd(dst, ktmp, dst, merge, vec_enc);
5763       vpackusdw(dst, xtmp2, dst, vec_enc);
5764       break;
5765     case T_BYTE:
5766       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5767       // accessing the lookup table.
5768       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5769       // accessing the lookup table.
5770       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
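           // Illustrative scalar form (sketch only, not used by the code below; helper
           // name is hypothetical):
           //   static const uint8_t lz_nibble[16] = {4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0};
           //   int lzcnt_byte(uint8_t b) {
           //     return ((b >> 4) == 0) ? lz_nibble[b >> 4] + lz_nibble[b & 0x0F]
           //                            : lz_nibble[b >> 4];
           //   }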
5771       assert(VM_Version::supports_avx512bw(), "");
5772       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
5773       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
5774       vpand(xtmp2, dst, src, vec_enc);
5775       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5776       vpsrlw(xtmp3, src, 4, vec_enc);
5777       vpand(xtmp3, dst, xtmp3, vec_enc);
5778       vpshufb(dst, xtmp1, xtmp3, vec_enc);
5779       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5780       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
5781       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
5782       break;
5783     default:
5784       fatal("Unsupported type %s", type2name(bt));
5785       break;
5786   }
5787 }
5788 
5789 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5790                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5791   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
5792   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5793   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5794   // accessing the lookup table.
5795   vpand(dst, xtmp2, src, vec_enc);
5796   vpshufb(dst, xtmp1, dst, vec_enc);
5797   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5798   // accessing the lookup table.
5799   vpsrlw(xtmp3, src, 4, vec_enc);
5800   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
5801   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
5802   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5803   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5804   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
5805   vpaddb(dst, dst, xtmp2, vec_enc);
5806   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
5807 }
5808 
5809 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5810                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5811   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5812   // Add zero counts of lower byte and upper byte of a word if
5813   // upper byte holds a zero value.
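       // i.e. per 16-bit lane: lz16(w) = ((w >> 8) == 0) ? 8 + lz8(w & 0xFF) : lz8(w >> 8)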
5814   vpsrlw(xtmp3, src, 8, vec_enc);
5815   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5816   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
5817   vpsllw(xtmp2, dst, 8, vec_enc);
5818   vpaddw(xtmp2, xtmp2, dst, vec_enc);
5819   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5820   vpsrlw(dst, dst, 8, vec_enc);
5821 }
5822 
5823 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5824                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
5825   // Since the IEEE 754 floating point format keeps the mantissa in normalized
5826   // 1.x form, the biased exponent obtained from an int-to-float conversion can be
5827   // used to compute the leading zero count as per the following formula:
5828   // LZCNT = 32 - ((biased_exp - 127) + 1)
5829   // Special handling has been introduced for Zero, Max_Int and -ve source values.
5830 
5831   // Broadcast 0xFF
5832   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
5833   vpsrld(xtmp1, xtmp1, 24, vec_enc);
5834 
5835   // Extract biased exponent.
5836   vcvtdq2ps(dst, src, vec_enc);
5837   vpsrld(dst, dst, 23, vec_enc);
5838   vpand(dst, dst, xtmp1, vec_enc);
5839 
5840   // Broadcast 127.
5841   vpsrld(xtmp1, xtmp1, 1, vec_enc);
5842   // Exponent = biased_exp - 127
5843   vpsubd(dst, dst, xtmp1, vec_enc);
5844 
5845   // Exponent = Exponent  + 1
5846   vpsrld(xtmp3, xtmp1, 6, vec_enc);
5847   vpaddd(dst, dst, xtmp3, vec_enc);
5848 
5849   // Replace -ve exponent with zero, exponent is -ve when src
5850   // lane contains a zero value.
5851   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5852   vblendvps(dst, dst, xtmp2, dst, vec_enc);
5853 
5854   // Rematerialize broadcast 32.
5855   vpslld(xtmp1, xtmp3, 5, vec_enc);
5856   // Exponent is 32 if corresponding source lane contains max_int value.
5857   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5858   // LZCNT = 32 - exponent
5859   vpsubd(dst, xtmp1, dst, vec_enc);
5860 
5861   // Replace LZCNT with a value 1 if corresponding source lane
5862   // contains max_int value.
5863   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
5864 
5865   // Replace biased_exp with 0 if source lane value is less than zero.
5866   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5867   vblendvps(dst, dst, xtmp2, src, vec_enc);
5868 }
5869 
5870 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5871                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5872   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5873   // Add zero counts of lower word and upper word of a double word if
5874   // upper word holds a zero value.
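       // i.e. per 32-bit lane: lz32(x) = ((x >> 16) == 0) ? 16 + lz16(x & 0xFFFF) : lz16(x >> 16)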
5875   vpsrld(xtmp3, src, 16, vec_enc);
5876   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5877   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
5878   vpslld(xtmp2, dst, 16, vec_enc);
5879   vpaddd(xtmp2, xtmp2, dst, vec_enc);
5880   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5881   vpsrld(dst, dst, 16, vec_enc);
5882   // Add zero counts of lower doubleword and upper doubleword of a
5883   // quadword if upper doubleword holds a zero value.
5884   vpsrlq(xtmp3, src, 32, vec_enc);
5885   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
5886   vpsllq(xtmp2, dst, 32, vec_enc);
5887   vpaddq(xtmp2, xtmp2, dst, vec_enc);
5888   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5889   vpsrlq(dst, dst, 32, vec_enc);
5890 }
5891 
5892 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
5893                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5894                                                        Register rtmp, int vec_enc) {
5895   assert(is_integral_type(bt), "unexpected type");
5896   assert(vec_enc < Assembler::AVX_512bit, "");
5897   switch(bt) {
5898     case T_LONG:
5899       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5900       break;
5901     case T_INT:
5902       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
5903       break;
5904     case T_SHORT:
5905       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5906       break;
5907     case T_BYTE:
5908       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5909       break;
5910     default:
5911       fatal("Unsupported type %s", type2name(bt));
5912       break;
5913   }
5914 }
5915 
5916 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
5917   switch(bt) {
5918     case T_BYTE:
5919       vpsubb(dst, src1, src2, vec_enc);
5920       break;
5921     case T_SHORT:
5922       vpsubw(dst, src1, src2, vec_enc);
5923       break;
5924     case T_INT:
5925       vpsubd(dst, src1, src2, vec_enc);
5926       break;
5927     case T_LONG:
5928       vpsubq(dst, src1, src2, vec_enc);
5929       break;
5930     default:
5931       fatal("Unsupported type %s", type2name(bt));
5932       break;
5933   }
5934 }
5935 
5936 // Trailing zero count computation is based on the leading zero count operation as per
5937 // the following equation. All AVX3 targets support the AVX512CD feature, which offers
5938 // a direct vector instruction to compute the leading zero count.
5939 //      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
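     //      e.g. for x = 0b...01000: (x - 1) & ~x = 0b00111, so CLZ = PRIM_TYPE_WIDTH - 3
     //      and CTZ = 3; for x = 0 the masked value is all ones and CTZ = PRIM_TYPE_WIDTH.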
5940 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5941                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5942                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
5943   assert(is_integral_type(bt), "");
5944   // xtmp = -1
5945   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
5946   // xtmp = xtmp + src
5947   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
5948   // xtmp = xtmp & ~src
5949   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
5950   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
5951   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
5952   vpsub(bt, dst, xtmp4, dst, vec_enc);
5953 }
5954 
5955 // Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation:
5956 //      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
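     //      e.g. for x = 0b...01000: x | -x sets every bit from bit 3 upwards, so
     //      POPC(x | -x) = PRIM_TYPE_WIDTH - 3 and CTZ = 3; for x = 0, x | -x = 0 and
     //      CTZ = PRIM_TYPE_WIDTH.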
5957 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5958                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5959   assert(is_integral_type(bt), "");
5960   // xtmp = 0
5961   vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
5962   // xtmp = 0 - src
5963   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
5964   // xtmp = xtmp | src
5965   vpor(xtmp3, xtmp3, src, vec_enc);
5966   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
5967   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
5968   vpsub(bt, dst, xtmp1, dst, vec_enc);
5969 }
5970 
5971 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
5972   Label done;
5973   Label neg_divisor_fastpath;
5974   cmpl(divisor, 0);
5975   jccb(Assembler::less, neg_divisor_fastpath);
5976   xorl(rdx, rdx);
5977   divl(divisor);
5978   jmpb(done);
5979   bind(neg_divisor_fastpath);
5980   // Fastpath for divisor < 0:
5981   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5982   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
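       // When the divisor has its sign bit set it is >= 2^31 as an unsigned value, so the
       // unsigned quotient can only be 0 or 1; the expression above evaluates to 1 exactly
       // when dividend >= divisor under an unsigned comparison.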
5983   movl(rdx, rax);
5984   subl(rdx, divisor);
5985   if (VM_Version::supports_bmi1()) {
5986     andnl(rax, rdx, rax);
5987   } else {
5988     notl(rdx);
5989     andl(rax, rdx);
5990   }
5991   shrl(rax, 31);
5992   bind(done);
5993 }
5994 
5995 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
5996   Label done;
5997   Label neg_divisor_fastpath;
5998   cmpl(divisor, 0);
5999   jccb(Assembler::less, neg_divisor_fastpath);
6000   xorl(rdx, rdx);
6001   divl(divisor);
6002   jmpb(done);
6003   bind(neg_divisor_fastpath);
6004   // Fastpath when divisor < 0:
6005   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6006   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
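       // Since the unsigned quotient is 0 or 1 here (see udivI above), the remainder is
       // either the dividend itself, or dividend - divisor when dividend >= divisor under
       // an unsigned comparison.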
6007   movl(rdx, rax);
6008   subl(rax, divisor);
6009   if (VM_Version::supports_bmi1()) {
6010     andnl(rax, rax, rdx);
6011   } else {
6012     notl(rax);
6013     andl(rax, rdx);
6014   }
6015   sarl(rax, 31);
6016   andl(rax, divisor);
6017   subl(rdx, rax);
6018   bind(done);
6019 }
6020 
6021 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6022   Label done;
6023   Label neg_divisor_fastpath;
6024 
6025   cmpl(divisor, 0);
6026   jccb(Assembler::less, neg_divisor_fastpath);
6027   xorl(rdx, rdx);
6028   divl(divisor);
6029   jmpb(done);
6030   bind(neg_divisor_fastpath);
6031   // Fastpath for divisor < 0:
6032   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6033   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6034   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6035   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6036   movl(rdx, rax);
6037   subl(rax, divisor);
6038   if (VM_Version::supports_bmi1()) {
6039     andnl(rax, rax, rdx);
6040   } else {
6041     notl(rax);
6042     andl(rax, rdx);
6043   }
6044   movl(tmp, rax);
6045   shrl(rax, 31); // quotient
6046   sarl(tmp, 31);
6047   andl(tmp, divisor);
6048   subl(rdx, tmp); // remainder
6049   bind(done);
6050 }
6051 
6052 #ifdef _LP64
6053 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6054                                  XMMRegister xtmp2, Register rtmp) {
6055   if (VM_Version::supports_gfni()) {
6056     // Galois field instruction based bit reversal based on following algorithm.
6057     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
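         // Under GF2P8AFFINEQB's bit ordering, the 0x8040201008040201 matrix maps bit i of
         // each byte to bit 7 - i, i.e. it reverses the bits within every byte; the bswapl
         // below then reverses the byte order to complete the 32-bit reversal.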
6058     mov64(rtmp, 0x8040201008040201L);
6059     movq(xtmp1, src);
6060     movq(xtmp2, rtmp);
6061     gf2p8affineqb(xtmp1, xtmp2, 0);
6062     movq(dst, xtmp1);
6063   } else {
6064     // Swap even and odd numbered bits.
6065     movl(rtmp, src);
6066     andl(rtmp, 0x55555555);
6067     shll(rtmp, 1);
6068     movl(dst, src);
6069     andl(dst, 0xAAAAAAAA);
6070     shrl(dst, 1);
6071     orl(dst, rtmp);
6072 
6073     // Swap LSB and MSB 2 bits of each nibble.
6074     movl(rtmp, dst);
6075     andl(rtmp, 0x33333333);
6076     shll(rtmp, 2);
6077     andl(dst, 0xCCCCCCCC);
6078     shrl(dst, 2);
6079     orl(dst, rtmp);
6080 
6081     // Swap LSB and MSB 4 bits of each byte.
6082     movl(rtmp, dst);
6083     andl(rtmp, 0x0F0F0F0F);
6084     shll(rtmp, 4);
6085     andl(dst, 0xF0F0F0F0);
6086     shrl(dst, 4);
6087     orl(dst, rtmp);
6088   }
6089   bswapl(dst);
6090 }
6091 
6092 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6093                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
6094   if (VM_Version::supports_gfni()) {
6095     // Galois field instruction based bit reversal based on following algorithm.
6096     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6097     mov64(rtmp1, 0x8040201008040201L);
6098     movq(xtmp1, src);
6099     movq(xtmp2, rtmp1);
6100     gf2p8affineqb(xtmp1, xtmp2, 0);
6101     movq(dst, xtmp1);
6102   } else {
6103     // Swap even and odd numbered bits.
6104     movq(rtmp1, src);
6105     mov64(rtmp2, 0x5555555555555555L);
6106     andq(rtmp1, rtmp2);
6107     shlq(rtmp1, 1);
6108     movq(dst, src);
6109     notq(rtmp2);
6110     andq(dst, rtmp2);
6111     shrq(dst, 1);
6112     orq(dst, rtmp1);
6113 
6114     // Swap LSB and MSB 2 bits of each nibble.
6115     movq(rtmp1, dst);
6116     mov64(rtmp2, 0x3333333333333333L);
6117     andq(rtmp1, rtmp2);
6118     shlq(rtmp1, 2);
6119     notq(rtmp2);
6120     andq(dst, rtmp2);
6121     shrq(dst, 2);
6122     orq(dst, rtmp1);
6123 
6124     // Swap LSB and MSB 4 bits of each byte.
6125     movq(rtmp1, dst);
6126     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6127     andq(rtmp1, rtmp2);
6128     shlq(rtmp1, 4);
6129     notq(rtmp2);
6130     andq(dst, rtmp2);
6131     shrq(dst, 4);
6132     orq(dst, rtmp1);
6133   }
6134   bswapq(dst);
6135 }
6136 
6137 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6138   Label done;
6139   Label neg_divisor_fastpath;
6140   cmpq(divisor, 0);
6141   jccb(Assembler::less, neg_divisor_fastpath);
6142   xorl(rdx, rdx);
6143   divq(divisor);
6144   jmpb(done);
6145   bind(neg_divisor_fastpath);
6146   // Fastpath for divisor < 0:
6147   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6148   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6149   movq(rdx, rax);
6150   subq(rdx, divisor);
6151   if (VM_Version::supports_bmi1()) {
6152     andnq(rax, rdx, rax);
6153   } else {
6154     notq(rdx);
6155     andq(rax, rdx);
6156   }
6157   shrq(rax, 63);
6158   bind(done);
6159 }
6160 
6161 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6162   Label done;
6163   Label neg_divisor_fastpath;
6164   cmpq(divisor, 0);
6165   jccb(Assembler::less, neg_divisor_fastpath);
6166   xorq(rdx, rdx);
6167   divq(divisor);
6168   jmp(done);
6169   bind(neg_divisor_fastpath);
6170   // Fastpath when divisor < 0:
6171   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6172   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6173   movq(rdx, rax);
6174   subq(rax, divisor);
6175   if (VM_Version::supports_bmi1()) {
6176     andnq(rax, rax, rdx);
6177   } else {
6178     notq(rax);
6179     andq(rax, rdx);
6180   }
6181   sarq(rax, 63);
6182   andq(rax, divisor);
6183   subq(rdx, rax);
6184   bind(done);
6185 }
6186 
6187 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6188   Label done;
6189   Label neg_divisor_fastpath;
6190   cmpq(divisor, 0);
6191   jccb(Assembler::less, neg_divisor_fastpath);
6192   xorq(rdx, rdx);
6193   divq(divisor);
6194   jmp(done);
6195   bind(neg_divisor_fastpath);
6196   // Fastpath for divisor < 0:
6197   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6198   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6199   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6200   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6201   movq(rdx, rax);
6202   subq(rax, divisor);
6203   if (VM_Version::supports_bmi1()) {
6204     andnq(rax, rax, rdx);
6205   } else {
6206     notq(rax);
6207     andq(rax, rdx);
6208   }
6209   movq(tmp, rax);
6210   shrq(rax, 63); // quotient
6211   sarq(tmp, 63);
6212   andq(tmp, divisor);
6213   subq(rdx, tmp); // remainder
6214   bind(done);
6215 }
6216 #endif
6217 
6218 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6219                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6220                                         int vlen_enc) {
6221   assert(VM_Version::supports_avx512bw(), "");
6222   // Byte shuffles are in-lane operations and indices are determined using the
6223   // lower 4 bits of each shuffle lane, thus all shuffle indices are
6224   // normalized to the index range 0-15. This makes sure that indices which
6225   // differ by a multiple of 16 map to the same relative position within a
6226   // 128 bit lane, i.e. elements corresponding to shuffle indices 0, 16, 32 and
6227   // 48 all select the first element of their respective 128 bit lanes.
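       // Illustrative scalar form of the overall selection (sketch only, assuming shuffle
       // indices are in the range 0..63):
       //   for (int i = 0; i < 64; i++) {
       //     int idx  = shuffle[i];
       //     int lane = idx >> 4;            // which 128-bit lane of src supplies the byte
       //     dst[i]   = src[lane * 16 + (idx & 0x0F)];
       //   }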
6228   movl(rtmp, 16);
6229   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6230 
6231   // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16.
6232   // Broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using
6233   // the original shuffle indices and move the shuffled lanes corresponding to a true
6234   // mask bit to the destination vector.
6235   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6236   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6237   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6238 
6239   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6240   // and broadcasting second 128 bit lane.
6241   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6242   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6243   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6244   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6245   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6246 
6247   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6248   // and broadcasting third 128 bit lane.
6249   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6250   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6251   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6252   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6253   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6254 
6255   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
6256   // and broadcasting fourth 128 bit lane.
6257   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6258   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6259   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6260   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6261   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6262 }
6263 
6264 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6265                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6266   if (vlen_enc == AVX_128bit) {
6267     vpermilps(dst, src, shuffle, vlen_enc);
6268   } else if (bt == T_INT) {
6269     vpermd(dst, shuffle, src, vlen_enc);
6270   } else {
6271     assert(bt == T_FLOAT, "");
6272     vpermps(dst, shuffle, src, vlen_enc);
6273   }
6274 }