1 /*
   2  * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "gc/shared/barrierSet.hpp"
  28 #include "gc/shared/barrierSetAssembler.hpp"
  29 #include "oops/methodData.hpp"
  30 #include "opto/c2_MacroAssembler.hpp"
  31 #include "opto/intrinsicnode.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/opcodes.hpp"
  34 #include "opto/subnode.hpp"
  35 #include "runtime/globals.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 #include "utilities/checkedCast.hpp"
  39 #include "utilities/globalDefinitions.hpp"
  40 #include "utilities/powerOfTwo.hpp"
  41 #include "utilities/sizes.hpp"
  42 
  43 #ifdef PRODUCT
  44 #define BLOCK_COMMENT(str) /* nothing */
  45 #define STOP(error) stop(error)
  46 #else
  47 #define BLOCK_COMMENT(str) block_comment(str)
  48 #define STOP(error) block_comment(error); stop(error)
  49 #endif
  50 
  51 // C2 compiled method's prolog code.
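// A rough sketch of the frame laid out by this prologue (in the common case
// where a stack bang is emitted and rbp is pushed explicitly):
//
//   [ return address ]                            <- rsp on entry
//   [ saved rbp      ]
//   [ framesize - 2*wordSize bytes of frame body ] <- rsp after the prologue
//
// frame_size_in_bytes() includes the return address and the saved rbp slot,
// which is why wordSize is subtracted twice below.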
  52 void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  53   if (C->clinit_barrier_on_entry()) {
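    // A method of a class that is still being initialized may only be executed
    // by the thread performing the initialization. clinit_barrier() takes the
    // fast path when the holder is fully initialized (or being initialized by
    // the current thread); otherwise we bail out to the wrong-method stub.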
  54     assert(VM_Version::supports_fast_class_init_checks(), "sanity");
  55     assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");
  56 
  57     Label L_skip_barrier;
  58     Register klass = rscratch1;
  59 
  60     mov_metadata(klass, C->method()->holder()->constant_encoding());
  61     clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
  62 
  63     jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
  64 
  65     bind(L_skip_barrier);
  66   }
  67 
  68   int framesize = C->output()->frame_size_in_bytes();
  69   int bangsize = C->output()->bang_size_in_bytes();
  70   bool fp_mode_24b = false;
  71   int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;
  72 
  73   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  74 
  75   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  76   // Remove word for return addr
  77   framesize -= wordSize;
  78   stack_bang_size -= wordSize;
  79 
  80   // Calls to C2R adapters often do not accept exceptional returns.
  81   // We require that their callers must bang for them.  But be careful, because
  82   // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  The stack safety zone, however, should account for that.
  84   // See bugs 4446381, 4468289, 4497237.
  85   if (stack_bang_size > 0) {
  86     generate_stack_overflow_check(stack_bang_size);
  87 
    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
  90     push(rbp);
  91     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  92     if (PreserveFramePointer) {
  93       mov(rbp, rsp);
  94     }
    // Remove word for rbp
  96     framesize -= wordSize;
  97 
  98     // Create frame
  99     if (framesize) {
 100       subptr(rsp, framesize);
 101     }
 102   } else {
 103     subptr(rsp, framesize);
 104 
 105     // Save RBP register now.
 106     framesize -= wordSize;
 107     movptr(Address(rsp, framesize), rbp);
 108     // Save caller's stack pointer into RBP if the frame pointer is preserved.
 109     if (PreserveFramePointer) {
 110       movptr(rbp, rsp);
 111       if (framesize > 0) {
 112         addptr(rbp, framesize);
 113       }
 114     }
 115   }
 116 
 117   if (C->needs_stack_repair()) {
 118     // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
 119     assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
 120     movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize);
 121   }
 122 
 123   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 124     framesize -= wordSize;
 125     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 126   }
 127 
 128 #ifdef ASSERT
 129   if (VerifyStackAtCalls) {
 130     Label L;
 131     push(rax);
 132     mov(rax, rsp);
 133     andptr(rax, StackAlignmentInBytes-1);
 134     cmpptr(rax, StackAlignmentInBytes-wordSize);
 135     pop(rax);
 136     jcc(Assembler::equal, L);
 137     STOP("Stack is not properly aligned!");
 138     bind(L);
 139   }
 140 #endif
 141 }
 142 
 143 void C2_MacroAssembler::entry_barrier() {
 144   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 145   // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 146   Label dummy_slow_path;
 147   Label dummy_continuation;
 148   Label* slow_path = &dummy_slow_path;
 149   Label* continuation = &dummy_continuation;
 150   if (!Compile::current()->output()->in_scratch_emit_size()) {
 151     // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 152     C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 153     Compile::current()->output()->add_stub(stub);
 154     slow_path = &stub->entry();
 155     continuation = &stub->continuation();
 156   }
 157   bs->nmethod_entry_barrier(this, slow_path, continuation);
 158 }
 159 
 160 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 161   switch (vlen_in_bytes) {
 162     case  4: // fall-through
 163     case  8: // fall-through
 164     case 16: return Assembler::AVX_128bit;
 165     case 32: return Assembler::AVX_256bit;
 166     case 64: return Assembler::AVX_512bit;
 167 
 168     default: {
 169       ShouldNotReachHere();
 170       return Assembler::AVX_NoVec;
 171     }
 172   }
 173 }
 174 
 175 // fast_lock and fast_unlock used by C2
 176 
 177 // Because the transitions from emitted code to the runtime
 178 // monitorenter/exit helper stubs are so slow it's critical that
 179 // we inline both the stack-locking fast path and the inflated fast path.
 180 //
 181 // See also: cmpFastLock and cmpFastUnlock.
 182 //
 183 // What follows is a specialized inline transliteration of the code
 184 // in enter() and exit(). If we're concerned about I$ bloat another
 185 // option would be to emit TrySlowEnter and TrySlowExit methods
 186 // at startup-time.  These methods would accept arguments as
 187 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 188 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 189 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 190 // In practice, however, the # of lock sites is bounded and is usually small.
 191 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
 195 //
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 197 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 198 // to those specialized methods.  That'd give us a mostly platform-independent
 199 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) issue explicit barriers or fence operations.
 204 //
 205 // TODO:
 206 //
 207 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 208 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 209 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 210 //    the lock operators would typically be faster than reifying Self.
 211 //
 212 // *  Ideally I'd define the primitives as:
 213 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 214 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 215 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
 217 //    Furthermore the register assignments are overconstrained, possibly resulting in
 218 //    sub-optimal code near the synchronization site.
 219 //
 220 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 221 //    Alternately, use a better sp-proximity test.
 222 //
 223 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 224 //    Either one is sufficient to uniquely identify a thread.
 225 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 226 //
 227 // *  Intrinsify notify() and notifyAll() for the common cases where the
 228 //    object is locked by the calling thread but the waitlist is empty.
//    This avoids the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
 230 //
 231 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 232 //    But beware of excessive branch density on AMD Opterons.
 233 //
 234 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 235 //    or failure of the fast path.  If the fast path fails then we pass
 236 //    control to the slow path, typically in C.  In fast_lock and
 237 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 238 //    will emit a conditional branch immediately after the node.
 239 //    So we have branches to branches and lots of ICC.ZF games.
 240 //    Instead, it might be better to have C2 pass a "FailureLabel"
 241 //    into fast_lock and fast_unlock.  In the case of success, control
 242 //    will drop through the node.  ICC.ZF is undefined at exit.
 243 //    In the case of failure, the node will branch directly to the
 244 //    FailureLabel
 245 
 246 
 247 // obj: object to lock
 248 // box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
 250 // scr: tmp -- KILLED
 251 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 252                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 253                                  Metadata* method_data) {
 254   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 255   // Ensure the register assignments are disjoint
 256   assert(tmpReg == rax, "");
 257   assert(cx1Reg == noreg, "");
 258   assert(cx2Reg == noreg, "");
 259   assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 260 
 261   // Possible cases that we'll encounter in fast_lock
 262   // ------------------------------------------------
 263   // * Inflated
 264   //    -- unlocked
 265   //    -- Locked
 266   //       = by self
 267   //       = by other
 268   // * neutral
 269   // * stack-locked
 270   //    -- by self
 271   //       = sp-proximity test hits
 272   //       = sp-proximity test generates false-negative
 273   //    -- by other
 274   //
 275 
 276   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 277 
 278   if (DiagnoseSyncOnValueBasedClasses != 0) {
 279     load_klass(tmpReg, objReg, scrReg);
 280     testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 281     jcc(Assembler::notZero, DONE_LABEL);
 282   }
 283 
 284   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 285   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 286   jcc(Assembler::notZero, IsInflated);
 287 
 288   if (LockingMode == LM_MONITOR) {
 289     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 290     testptr(objReg, objReg);
 291   } else {
 292     assert(LockingMode == LM_LEGACY, "must be");
 293     // Attempt stack-locking ...
 294     orptr (tmpReg, markWord::unlocked_value);
 295     if (EnableValhalla) {
 296       // Mask inline_type bit such that we go to the slow path if object is an inline type
 297       andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place));
 298     }
 299     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 300     lock();
 301     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 302     jcc(Assembler::equal, COUNT);           // Success
 303 
 304     // Recursive locking.
 305     // The object is stack-locked: markword contains stack pointer to BasicLock.
 306     // Locked by current thread if difference with current SP is less than one page.
 307     subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
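    // The mask (7 - page_size) keeps all bits above the in-page offset plus the
    // low three alignment bits, so the AND yields zero only if the markWord held
    // a word-aligned address at most one page above the current rsp, i.e. a
    // BasicLock on this thread's stack.  The zero stored into the box below is
    // the recursion marker that fast_unlock checks for.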
 309     andptr(tmpReg, (int32_t) (7 - (int)os::vm_page_size()) );
 310     movptr(Address(boxReg, 0), tmpReg);
 311   }
 312   jmp(DONE_LABEL);
 313 
 314   bind(IsInflated);
 315   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 316 
 317   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 318   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 319   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 320 
 321   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 322   movptr(boxReg, Address(r15_thread, JavaThread::monitor_owner_id_offset()));
 323   movq(scrReg, tmpReg);
 324   xorq(tmpReg, tmpReg);
 325   lock();
 326   cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 327 
 328   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 329   jccb(Assembler::equal, COUNT);    // CAS above succeeded; propagate ZF = 1 (success)
 330 
 331   cmpptr(boxReg, rax);                // Check if we are already the owner (recursive lock)
 332   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 333   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 334   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 335   bind(DONE_LABEL);
 336 
 337   // ZFlag == 1 count in fast path
 338   // ZFlag == 0 count in slow path
 339   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 340 
 341   bind(COUNT);
 342   if (LockingMode == LM_LEGACY) {
 343     // Count monitors in fast path
 344     increment(Address(thread, JavaThread::held_monitor_count_offset()));
 345   }
 346   xorl(tmpReg, tmpReg); // Set ZF == 1
 347 
 348   bind(NO_COUNT);
 349 
 350   // At NO_COUNT the icc ZFlag is set as follows ...
 351   // fast_unlock uses the same protocol.
 352   // ZFlag == 1 -> Success
 353   // ZFlag == 0 -> Failure - force control through the slow path
 354 }
 355 
 356 // obj: object to unlock
 357 // box: box address (displaced header location), killed.  Must be EAX.
 358 // tmp: killed, cannot be obj nor box.
 359 //
 360 // Some commentary on balanced locking:
 361 //
 362 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 363 // Methods that don't have provably balanced locking are forced to run in the
 364 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 365 // The interpreter provides two properties:
 366 // I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
 368 //      interpreter maintains an on-stack list of locks currently held by
 369 //      a frame.
// I2:  If a method attempts to unlock an object that is not held by
//      the frame, the interpreter throws IMSX.
 372 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 374 // B() doesn't have provably balanced locking so it runs in the interpreter.
 375 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 376 // is still locked by A().
 377 //
 378 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 379 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 380 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 381 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->_owner == Self check in unlock.
 385 // A perfectly viable alternative is to elide the owner check except when
 386 // Xcheck:jni is enabled.
 387 
 388 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
 389   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 390   assert(boxReg == rax, "");
 391   assert_different_registers(objReg, boxReg, tmpReg);
 392 
 393   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 394 
 395   if (LockingMode == LM_LEGACY) {
 396     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 397     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 398   }
 399   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 400   if (LockingMode != LM_MONITOR) {
 401     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 402     jcc(Assembler::zero, Stacked);
 403   }
 404 
 405   // It's inflated.
 406 
 407   // Despite our balanced locking property we still check that m->_owner == Self
 408   // as java routines or native JNI code called by this thread might
 409   // have released the lock.
 410   //
 411   // If there's no contention try a 1-0 exit.  That is, exit without
 412   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 413   // we detect and recover from the race that the 1-0 exit admits.
 414   //
 415   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 416   // before it STs null into _owner, releasing the lock.  Updates
 417   // to data protected by the critical section must be visible before
 418   // we drop the lock (and thus before any other thread could acquire
 419   // the lock and observe the fields protected by the lock).
 420   // IA32's memory-model is SPO, so STs are ordered with respect to
 421   // each other and there's no need for an explicit barrier (fence).
 422   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 423   Label LSuccess, LNotRecursive;
 424 
 425   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 426   jccb(Assembler::equal, LNotRecursive);
 427 
 428   // Recursive inflated unlock
 429   decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 430   jmpb(LSuccess);
 431 
 432   bind(LNotRecursive);
 433 
 434   // Set owner to null.
 435   // Release to satisfy the JMM
 436   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 437   // We need a full fence after clearing owner to avoid stranding.
 438   // StoreLoad achieves this.
 439   membar(StoreLoad);
 440 
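  // Note on the two checks below: after the owner field has been released,
  // another thread may already have enqueued itself.  If the entry_list is
  // non-empty and no successor is being woken, we are responsible for waking
  // one; in that case record the monitor and fail into the slow path, which
  // reacquires via SharedRuntime::monitor_exit_helper() (see below).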
 441   // Check if the entry_list is empty.
 442   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(entry_list)), NULL_WORD);
 443   jccb(Assembler::zero, LSuccess);    // If so we are done.
 444 
 445   // Check if there is a successor.
 446   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 447   jccb(Assembler::notZero, LSuccess); // If so we are done.
 448 
 449   // Save the monitor pointer in the current thread, so we can try to
 450   // reacquire the lock in SharedRuntime::monitor_exit_helper().
 451   andptr(tmpReg, ~(int32_t)markWord::monitor_value);
 452   movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);
 453 
 454   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 455   jmpb  (DONE_LABEL);
 456 
 457   bind  (LSuccess);
 458   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 459   jmpb  (DONE_LABEL);
 460 
 461   if (LockingMode == LM_LEGACY) {
 462     bind  (Stacked);
 463     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 464     lock();
 465     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 466     // Intentional fall-thru into DONE_LABEL
 467   }
 468 
 469   bind(DONE_LABEL);
 470 
 471   // ZFlag == 1 count in fast path
 472   // ZFlag == 0 count in slow path
 473   jccb(Assembler::notZero, NO_COUNT);
 474 
 475   bind(COUNT);
 476 
 477   if (LockingMode == LM_LEGACY) {
 478     // Count monitors in fast path
 479     decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 480   }
 481 
 482   xorl(tmpReg, tmpReg); // Set ZF == 1
 483 
 484   bind(NO_COUNT);
 485 }
 486 
 487 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 488                                               Register t, Register thread) {
 489   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 490   assert(rax_reg == rax, "Used for CAS");
 491   assert_different_registers(obj, box, rax_reg, t, thread);
 492 
 493   // Handle inflated monitor.
 494   Label inflated;
 495   // Finish fast lock successfully. ZF value is irrelevant.
 496   Label locked;
 497   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 498   Label slow_path;
 499 
 500   if (UseObjectMonitorTable) {
 501     // Clear cache in case fast locking succeeds or we need to take the slow-path.
 502     movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
 503   }
 504 
 505   if (DiagnoseSyncOnValueBasedClasses != 0) {
 506     load_klass(rax_reg, obj, t);
 507     testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 508     jcc(Assembler::notZero, slow_path);
 509   }
 510 
 511   const Register mark = t;
 512 
 513   { // Lightweight Lock
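    // The lock-stack is a small per-thread array of oops embedded in JavaThread.
    // lock_stack_top_offset() holds the current top as a byte offset from the
    // thread, so Address(thread, top) is the next free slot and
    // Address(thread, top, times_1, -oopSize) is the most recently pushed oop.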
 514 
 515     Label push;
 516 
 517     const Register top = UseObjectMonitorTable ? rax_reg : box;
 518 
 519     // Load the mark.
 520     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 521 
 522     // Prefetch top.
 523     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 524 
 525     // Check for monitor (0b10).
 526     testptr(mark, markWord::monitor_value);
 527     jcc(Assembler::notZero, inflated);
 528 
 529     // Check if lock-stack is full.
 530     cmpl(top, LockStack::end_offset() - 1);
 531     jcc(Assembler::greater, slow_path);
 532 
 533     // Check if recursive.
 534     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 535     jccb(Assembler::equal, push);
 536 
 537     // Try to lock. Transition lock bits 0b01 => 0b00
 538     movptr(rax_reg, mark);
 539     orptr(rax_reg, markWord::unlocked_value);
 540     andptr(mark, ~(int32_t)markWord::unlocked_value);
 541     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 542     jcc(Assembler::notEqual, slow_path);
 543 
 544     if (UseObjectMonitorTable) {
 545       // Need to reload top, clobbered by CAS.
 546       movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 547     }
 548     bind(push);
 549     // After successful lock, push object on lock-stack.
 550     movptr(Address(thread, top), obj);
 551     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 552     jmpb(locked);
 553   }
 554 
 555   { // Handle inflated monitor.
 556     bind(inflated);
 557 
 558     const Register monitor = t;
 559 
 560     if (!UseObjectMonitorTable) {
 561       assert(mark == monitor, "should be the same here");
 562     } else {
 563       // Uses ObjectMonitorTable.  Look for the monitor in the om_cache.
 564       // Fetch ObjectMonitor* from the cache or take the slow-path.
 565       Label monitor_found;
 566 
 567       // Load cache address
 568       lea(t, Address(thread, JavaThread::om_cache_oops_offset()));
 569 
 570       const int num_unrolled = 2;
 571       for (int i = 0; i < num_unrolled; i++) {
 572         cmpptr(obj, Address(t));
 573         jccb(Assembler::equal, monitor_found);
 574         increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 575       }
 576 
 577       Label loop;
 578 
 579       // Search for obj in cache.
 580       bind(loop);
 581 
 582       // Check for match.
 583       cmpptr(obj, Address(t));
 584       jccb(Assembler::equal, monitor_found);
 585 
 586       // Search until null encountered, guaranteed _null_sentinel at end.
 587       cmpptr(Address(t), 1);
 588       jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
 589       increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 590       jmpb(loop);
 591 
 592       // Cache hit.
 593       bind(monitor_found);
 594       movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
 595     }
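    // Without the ObjectMonitorTable, 'monitor' is still the raw markWord, which
    // carries the 0b10 monitor_value tag; folding the tag into the displacement
    // below avoids a separate untagging instruction.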
 596     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 597     const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 598     const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);
 599 
 600     Label monitor_locked;
 601     // Lock the monitor.
 602 
 603     if (UseObjectMonitorTable) {
 604       // Cache the monitor for unlock before trashing box. On failure to acquire
 605       // the lock, the slow path will reset the entry accordingly (see CacheSetter).
 606       movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
 607     }
 608 
 609     // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 610     xorptr(rax_reg, rax_reg);
 611     movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
 612     lock(); cmpxchgptr(box, owner_address);
 613     jccb(Assembler::equal, monitor_locked);
 614 
 615     // Check if recursive.
 616     cmpptr(box, rax_reg);
 617     jccb(Assembler::notEqual, slow_path);
 618 
 619     // Recursive.
 620     increment(recursions_address);
 621 
 622     bind(monitor_locked);
 623   }
 624 
 625   bind(locked);
 626   // Set ZF = 1
 627   xorl(rax_reg, rax_reg);
 628 
 629 #ifdef ASSERT
 630   // Check that locked label is reached with ZF set.
 631   Label zf_correct;
 632   Label zf_bad_zero;
 633   jcc(Assembler::zero, zf_correct);
 634   jmp(zf_bad_zero);
 635 #endif
 636 
 637   bind(slow_path);
 638 #ifdef ASSERT
 639   // Check that slow_path label is reached with ZF not set.
 640   jcc(Assembler::notZero, zf_correct);
 641   stop("Fast Lock ZF != 0");
 642   bind(zf_bad_zero);
 643   stop("Fast Lock ZF != 1");
 644   bind(zf_correct);
 645 #endif
 646   // C2 uses the value of ZF to determine the continuation.
 647 }
 648 
 649 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
 650   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 651   assert(reg_rax == rax, "Used for CAS");
 652   assert_different_registers(obj, reg_rax, t);
 653 
 654   // Handle inflated monitor.
 655   Label inflated, inflated_check_lock_stack;
 656   // Finish fast unlock successfully.  MUST jump with ZF == 1
 657   Label unlocked, slow_path;
 658 
 659   const Register mark = t;
 660   const Register monitor = t;
 661   const Register top = UseObjectMonitorTable ? t : reg_rax;
 662   const Register box = reg_rax;
 663 
 664   Label dummy;
 665   C2FastUnlockLightweightStub* stub = nullptr;
 666 
 667   if (!Compile::current()->output()->in_scratch_emit_size()) {
 668     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
 669     Compile::current()->output()->add_stub(stub);
 670   }
 671 
 672   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
 673 
 674   { // Lightweight Unlock
 675 
 676     // Load top.
 677     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 678 
 679     if (!UseObjectMonitorTable) {
 680       // Prefetch mark.
 681       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 682     }
 683 
 684     // Check if obj is top of lock-stack.
 685     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 686     // Top of lock stack was not obj. Must be monitor.
 687     jcc(Assembler::notEqual, inflated_check_lock_stack);
 688 
 689     // Pop lock-stack.
 690     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
 691     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 692 
 693     // Check if recursive.
 694     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
 695     jcc(Assembler::equal, unlocked);
 696 
 697     // We elide the monitor check, let the CAS fail instead.
 698 
 699     if (UseObjectMonitorTable) {
 700       // Load mark.
 701       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 702     }
 703 
 704     // Try to unlock. Transition lock bits 0b00 => 0b01
 705     movptr(reg_rax, mark);
 706     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
 707     orptr(mark, markWord::unlocked_value);
 708     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 709     jcc(Assembler::notEqual, push_and_slow_path);
 710     jmp(unlocked);
 711   }
 712 
 713 
 714   { // Handle inflated monitor.
 715     bind(inflated_check_lock_stack);
 716 #ifdef ASSERT
 717     Label check_done;
 718     subl(top, oopSize);
 719     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
 720     jcc(Assembler::below, check_done);
 721     cmpptr(obj, Address(thread, top));
 722     jccb(Assembler::notEqual, inflated_check_lock_stack);
 723     stop("Fast Unlock lock on stack");
 724     bind(check_done);
 725     if (UseObjectMonitorTable) {
 726       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 727     }
 728     testptr(mark, markWord::monitor_value);
 729     jccb(Assembler::notZero, inflated);
 730     stop("Fast Unlock not monitor");
 731 #endif
 732 
 733     bind(inflated);
 734 
 735     if (!UseObjectMonitorTable) {
 736       assert(mark == monitor, "should be the same here");
 737     } else {
 738       // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
 739       movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 740       // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
 741       cmpptr(monitor, alignof(ObjectMonitor*));
 742       jcc(Assembler::below, slow_path);
 743     }
 744     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 745     const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
 746     const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
 747     const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
 748     const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
 749 
 750     Label recursive;
 751 
 752     // Check if recursive.
 753     cmpptr(recursions_address, 0);
 754     jccb(Assembler::notZero, recursive);
 755 
 756     // Set owner to null.
 757     // Release to satisfy the JMM
 758     movptr(owner_address, NULL_WORD);
 759     // We need a full fence after clearing owner to avoid stranding.
 760     // StoreLoad achieves this.
 761     membar(StoreLoad);
 762 
 763     // Check if the entry_list is empty.
 764     cmpptr(entry_list_address, NULL_WORD);
 765     jccb(Assembler::zero, unlocked);    // If so we are done.
 766 
 767     // Check if there is a successor.
 768     cmpptr(succ_address, NULL_WORD);
 769     jccb(Assembler::notZero, unlocked); // If so we are done.
 770 
 771     // Save the monitor pointer in the current thread, so we can try to
 772     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 773     if (!UseObjectMonitorTable) {
 774       andptr(monitor, ~(int32_t)markWord::monitor_value);
 775     }
 776     movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);
 777 
 778     orl(t, 1); // Fast Unlock ZF = 0
 779     jmpb(slow_path);
 780 
 781     // Recursive unlock.
 782     bind(recursive);
 783     decrement(recursions_address);
 784   }
 785 
 786   bind(unlocked);
 787   xorl(t, t); // Fast Unlock ZF = 1
 788 
 789 #ifdef ASSERT
 790   // Check that unlocked label is reached with ZF set.
 791   Label zf_correct;
 792   Label zf_bad_zero;
 793   jcc(Assembler::zero, zf_correct);
 794   jmp(zf_bad_zero);
 795 #endif
 796 
 797   bind(slow_path);
 798   if (stub != nullptr) {
 799     bind(stub->slow_path_continuation());
 800   }
 801 #ifdef ASSERT
 802   // Check that stub->continuation() label is reached with ZF not set.
 803   jcc(Assembler::notZero, zf_correct);
 804   stop("Fast Unlock ZF != 0");
 805   bind(zf_bad_zero);
 806   stop("Fast Unlock ZF != 1");
 807   bind(zf_correct);
 808 #endif
 809   // C2 uses the value of ZF to determine the continuation.
 810 }
 811 
 812 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
 813   fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
 814 }
 815 
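// Recompute the frame pointer for the current compiled frame:
// rsp + framesize - 2 * wordSize is the address of the saved rbp slot, which is
// what rbp holds when PreserveFramePointer is enabled (see verified_entry()).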
 816 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
 817   const int framesize = Compile::current()->output()->frame_size_in_bytes();
 818   masm->movptr(dst, rsp);
 819   if (framesize > 2 * wordSize) {
 820     masm->addptr(dst, framesize - 2 * wordSize);
 821   }
 822 }
 823 
 824 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
 825   if (PreserveFramePointer) {
 826     // frame pointer is valid
 827 #ifdef ASSERT
 828     // Verify frame pointer value in rbp.
 829     reconstruct_frame_pointer_helper(this, rtmp);
 830     Label L_success;
 831     cmpq(rbp, rtmp);
 832     jccb(Assembler::equal, L_success);
 833     STOP("frame pointer mismatch");
 834     bind(L_success);
 835 #endif // ASSERT
 836   } else {
 837     reconstruct_frame_pointer_helper(this, rbp);
 838   }
 839 }
 840 
 841 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
 842   jint lo = t->_lo;
 843   jint hi = t->_hi;
 844   assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
 845   if (t == TypeInt::INT) {
 846     return;
 847   }
 848 
 849   BLOCK_COMMENT("CastII {");
 850   Label fail;
 851   Label succeed;
 852   if (hi == max_jint) {
 853     cmpl(val, lo);
 854     jccb(Assembler::greaterEqual, succeed);
 855   } else {
 856     if (lo != min_jint) {
 857       cmpl(val, lo);
 858       jccb(Assembler::less, fail);
 859     }
 860     cmpl(val, hi);
 861     jccb(Assembler::lessEqual, succeed);
 862   }
 863 
 864   bind(fail);
 865   movl(c_rarg0, idx);
 866   movl(c_rarg1, val);
 867   movl(c_rarg2, lo);
 868   movl(c_rarg3, hi);
 869   reconstruct_frame_pointer(rscratch1);
 870   call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
 871   hlt();
 872   bind(succeed);
 873   BLOCK_COMMENT("} // CastII");
 874 }
 875 
 876 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
 877   fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
 878 }
 879 
 880 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
 881   jlong lo = t->_lo;
 882   jlong hi = t->_hi;
 883   assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
 884   if (t == TypeLong::LONG) {
 885     return;
 886   }
 887 
 888   BLOCK_COMMENT("CastLL {");
 889   Label fail;
 890   Label succeed;
 891 
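  // Compare val against a 64-bit bound: use an immediate form when the bound
  // fits in a sign-extended 32-bit immediate, otherwise materialize it in tmp.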
 892   auto cmp_val = [&](jlong bound) {
 893     if (is_simm32(bound)) {
 894       cmpq(val, checked_cast<int>(bound));
 895     } else {
 896       mov64(tmp, bound);
 897       cmpq(val, tmp);
 898     }
 899   };
 900 
 901   if (hi == max_jlong) {
 902     cmp_val(lo);
 903     jccb(Assembler::greaterEqual, succeed);
 904   } else {
 905     if (lo != min_jlong) {
 906       cmp_val(lo);
 907       jccb(Assembler::less, fail);
 908     }
 909     cmp_val(hi);
 910     jccb(Assembler::lessEqual, succeed);
 911   }
 912 
 913   bind(fail);
 914   movl(c_rarg0, idx);
 915   movq(c_rarg1, val);
 916   mov64(c_rarg2, lo);
 917   mov64(c_rarg3, hi);
 918   reconstruct_frame_pointer(rscratch1);
 919   call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
 920   hlt();
 921   bind(succeed);
 922   BLOCK_COMMENT("} // CastLL");
 923 }
 924 
 925 //-------------------------------------------------------------------------------------------
 926 // Generic instructions support for use in .ad files C2 code generation
 927 
 928 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 929   if (dst != src) {
 930     movdqu(dst, src);
 931   }
 932   if (opcode == Op_AbsVD) {
 933     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 934   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 936     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 937   }
 938 }
 939 
 940 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 941   if (opcode == Op_AbsVD) {
 942     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 943   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 945     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 946   }
 947 }
 948 
 949 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 950   if (dst != src) {
 951     movdqu(dst, src);
 952   }
 953   if (opcode == Op_AbsVF) {
 954     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 955   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 957     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 958   }
 959 }
 960 
 961 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 962   if (opcode == Op_AbsVF) {
 963     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 964   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 966     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 967   }
 968 }
 969 
 970 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 971   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 972   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 973 
 974   if (opcode == Op_MinV) {
 975     if (elem_bt == T_BYTE) {
 976       pminsb(dst, src);
 977     } else if (elem_bt == T_SHORT) {
 978       pminsw(dst, src);
 979     } else if (elem_bt == T_INT) {
 980       pminsd(dst, src);
 981     } else {
 982       assert(elem_bt == T_LONG, "required");
 983       assert(tmp == xmm0, "required");
 984       assert_different_registers(dst, src, tmp);
 985       movdqu(xmm0, dst);
 986       pcmpgtq(xmm0, src);
 987       blendvpd(dst, src);  // xmm0 as mask
 988     }
 989   } else { // opcode == Op_MaxV
 990     if (elem_bt == T_BYTE) {
 991       pmaxsb(dst, src);
 992     } else if (elem_bt == T_SHORT) {
 993       pmaxsw(dst, src);
 994     } else if (elem_bt == T_INT) {
 995       pmaxsd(dst, src);
 996     } else {
 997       assert(elem_bt == T_LONG, "required");
 998       assert(tmp == xmm0, "required");
 999       assert_different_registers(dst, src, tmp);
1000       movdqu(xmm0, src);
1001       pcmpgtq(xmm0, dst);
1002       blendvpd(dst, src);  // xmm0 as mask
1003     }
1004   }
1005 }
1006 
1007 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
1008                                   XMMRegister src1, Address src2, int vlen_enc) {
1009   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
1010   if (opcode == Op_UMinV) {
1011     switch(elem_bt) {
1012       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
1013       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
1014       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
1015       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
1016       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
1017     }
1018   } else {
1019     assert(opcode == Op_UMaxV, "required");
1020     switch(elem_bt) {
1021       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
1022       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
1023       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
1024       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
1025       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
1026     }
1027   }
1028 }
1029 
1030 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
1031   // For optimality, leverage a full vector width of 512 bits
1032   // for operations over smaller vector sizes on AVX512 targets.
1033   if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
1034     if (opcode == Op_UMaxV) {
1035       evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
1036     } else {
1037       assert(opcode == Op_UMinV, "required");
1038       evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
1039     }
1040   } else {
1041     // T1 = -1
1042     vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
1043     // T1 = -1 << 63
1044     vpsllq(xtmp1, xtmp1, 63, vlen_enc);
1045     // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
1046     vpaddq(xtmp2, xtmp1, src2, vlen_enc);
1047     // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
1048     vpaddq(xtmp1, xtmp1, src1, vlen_enc);
1049     // Mask = T2 > T1
1050     vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
1051     if (opcode == Op_UMaxV) {
1052       // Res = Mask ? Src2 : Src1
1053       vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
1054     } else {
1055       // Res = Mask ? Src1 : Src2
1056       vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
1057     }
1058   }
1059 }
1060 
1061 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
1062                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
1063   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
1064   if (opcode == Op_UMinV) {
1065     switch(elem_bt) {
1066       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
1067       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
1068       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
1069       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
1070       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
1071     }
1072   } else {
1073     assert(opcode == Op_UMaxV, "required");
1074     switch(elem_bt) {
1075       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
1076       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
1077       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
1078       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
1079       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
1080     }
1081   }
1082 }
1083 
1084 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1085                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1086                                  int vlen_enc) {
1087   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1088 
1089   if (opcode == Op_MinV) {
1090     if (elem_bt == T_BYTE) {
1091       vpminsb(dst, src1, src2, vlen_enc);
1092     } else if (elem_bt == T_SHORT) {
1093       vpminsw(dst, src1, src2, vlen_enc);
1094     } else if (elem_bt == T_INT) {
1095       vpminsd(dst, src1, src2, vlen_enc);
1096     } else {
1097       assert(elem_bt == T_LONG, "required");
1098       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1099         vpminsq(dst, src1, src2, vlen_enc);
1100       } else {
1101         assert_different_registers(dst, src1, src2);
1102         vpcmpgtq(dst, src1, src2, vlen_enc);
1103         vblendvpd(dst, src1, src2, dst, vlen_enc);
1104       }
1105     }
1106   } else { // opcode == Op_MaxV
1107     if (elem_bt == T_BYTE) {
1108       vpmaxsb(dst, src1, src2, vlen_enc);
1109     } else if (elem_bt == T_SHORT) {
1110       vpmaxsw(dst, src1, src2, vlen_enc);
1111     } else if (elem_bt == T_INT) {
1112       vpmaxsd(dst, src1, src2, vlen_enc);
1113     } else {
1114       assert(elem_bt == T_LONG, "required");
1115       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1116         vpmaxsq(dst, src1, src2, vlen_enc);
1117       } else {
1118         assert_different_registers(dst, src1, src2);
1119         vpcmpgtq(dst, src1, src2, vlen_enc);
1120         vblendvpd(dst, src2, src1, dst, vlen_enc);
1121       }
1122     }
1123   }
1124 }
1125 
1126 // Float/Double min max
1127 
1128 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1129                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1130                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1131                                    int vlen_enc) {
1132   assert(UseAVX > 0, "required");
1133   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1134          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1135   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1136   assert_different_registers(a, tmp, atmp, btmp);
1137   assert_different_registers(b, tmp, atmp, btmp);
1138 
1139   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1140   bool is_double_word = is_double_word_type(elem_bt);
1141 
1142   /* Note on 'non-obvious' assembly sequence:
1143    *
1144    * While there are vminps/vmaxps instructions, there are two important differences between hardware
1145    * and Java on how they handle floats:
1146    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
1148    *
1149    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
1150    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
1151    *                (only useful when signs differ, noop otherwise)
1152    *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
1154    *  Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
1155    *   btmp = (b < +0.0) ? a : b
1156    *   atmp = (b < +0.0) ? b : a
1157    *   Tmp  = Max_Float(atmp , btmp)
1158    *   Res  = (atmp == NaN) ? atmp : Tmp
1159    */
1160 
1161   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
1162   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
1163   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
1164   XMMRegister mask;
1165 
1166   if (!is_double_word && is_min) {
1167     mask = a;
1168     vblend = &MacroAssembler::vblendvps;
1169     vmaxmin = &MacroAssembler::vminps;
1170     vcmp = &MacroAssembler::vcmpps;
1171   } else if (!is_double_word && !is_min) {
1172     mask = b;
1173     vblend = &MacroAssembler::vblendvps;
1174     vmaxmin = &MacroAssembler::vmaxps;
1175     vcmp = &MacroAssembler::vcmpps;
1176   } else if (is_double_word && is_min) {
1177     mask = a;
1178     vblend = &MacroAssembler::vblendvpd;
1179     vmaxmin = &MacroAssembler::vminpd;
1180     vcmp = &MacroAssembler::vcmppd;
1181   } else {
1182     assert(is_double_word && !is_min, "sanity");
1183     mask = b;
1184     vblend = &MacroAssembler::vblendvpd;
1185     vmaxmin = &MacroAssembler::vmaxpd;
1186     vcmp = &MacroAssembler::vcmppd;
1187   }
1188 
  // Pick the temporaries so the EnableX86ECoreOpts sequence stays correct when dst overlaps btmp
1190   XMMRegister maxmin, scratch;
1191   if (dst == btmp) {
1192     maxmin = btmp;
1193     scratch = tmp;
1194   } else {
1195     maxmin = tmp;
1196     scratch = btmp;
1197   }
1198 
1199   bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
1200   if (precompute_mask && !is_double_word) {
1201     vpsrad(tmp, mask, 32, vlen_enc);
1202     mask = tmp;
1203   } else if (precompute_mask && is_double_word) {
1204     vpxor(tmp, tmp, tmp, vlen_enc);
1205     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1206     mask = tmp;
1207   }
1208 
1209   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1210   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1211   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1212   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1213   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1214 }
1215 
1216 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1217                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1218                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1219                                     int vlen_enc) {
1220   assert(UseAVX > 2, "required");
1221   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1222          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1223   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1224   assert_different_registers(dst, a, atmp, btmp);
1225   assert_different_registers(dst, b, atmp, btmp);
1226 
1227   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1228   bool is_double_word = is_double_word_type(elem_bt);
1229   bool merge = true;
1230 
1231   if (!is_double_word && is_min) {
1232     evpmovd2m(ktmp, a, vlen_enc);
1233     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1234     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1235     vminps(dst, atmp, btmp, vlen_enc);
1236     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1237     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1238   } else if (!is_double_word && !is_min) {
1239     evpmovd2m(ktmp, b, vlen_enc);
1240     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1241     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1242     vmaxps(dst, atmp, btmp, vlen_enc);
1243     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1244     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1245   } else if (is_double_word && is_min) {
1246     evpmovq2m(ktmp, a, vlen_enc);
1247     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1248     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1249     vminpd(dst, atmp, btmp, vlen_enc);
1250     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1251     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1252   } else {
1253     assert(is_double_word && !is_min, "sanity");
1254     evpmovq2m(ktmp, b, vlen_enc);
1255     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1256     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1257     vmaxpd(dst, atmp, btmp, vlen_enc);
1258     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1259     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1260   }
1261 }
1262 
1263 void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1264                                    XMMRegister src1, XMMRegister src2, int vlen_enc) {
1265   assert(opc == Op_MinV || opc == Op_MinReductionV ||
1266          opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");
1267 
1268   int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_MINMAX_MIN_COMPARE_SIGN
1269                                                          : AVX10_MINMAX_MAX_COMPARE_SIGN;
1270   if (elem_bt == T_FLOAT) {
1271     evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
1272   } else {
1273     assert(elem_bt == T_DOUBLE, "");
1274     evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
1275   }
1276 }
1277 
1278 // Float/Double signum
1279 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1280   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1281 
1282   Label DONE_LABEL;
1283 
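  // ucomiss/ucomisd set the flags once; movflt/movdbl do not touch EFLAGS, so
  // the 'above' test below still reflects the comparison against zero:
  // dst > 0.0 keeps +1.0, dst < 0.0 flips the sign bit to produce -1.0.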
1284   if (opcode == Op_SignumF) {
1285     ucomiss(dst, zero);
1286     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1287     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1288     movflt(dst, one);
1289     jcc(Assembler::above, DONE_LABEL);
1290     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1291   } else if (opcode == Op_SignumD) {
1292     ucomisd(dst, zero);
1293     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1294     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1295     movdbl(dst, one);
1296     jcc(Assembler::above, DONE_LABEL);
1297     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1298   }
1299 
1300   bind(DONE_LABEL);
1301 }
1302 
1303 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1304   if (sign) {
1305     pmovsxbw(dst, src);
1306   } else {
1307     pmovzxbw(dst, src);
1308   }
1309 }
1310 
1311 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1312   if (sign) {
1313     vpmovsxbw(dst, src, vector_len);
1314   } else {
1315     vpmovzxbw(dst, src, vector_len);
1316   }
1317 }
1318 
1319 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1320   if (sign) {
1321     vpmovsxbd(dst, src, vector_len);
1322   } else {
1323     vpmovzxbd(dst, src, vector_len);
1324   }
1325 }
1326 
1327 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1328   if (sign) {
1329     vpmovsxwd(dst, src, vector_len);
1330   } else {
1331     vpmovzxwd(dst, src, vector_len);
1332   }
1333 }
1334 
1335 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1336                                      int shift, int vector_len) {
1337   if (opcode == Op_RotateLeftV) {
1338     if (etype == T_INT) {
1339       evprold(dst, src, shift, vector_len);
1340     } else {
1341       assert(etype == T_LONG, "expected type T_LONG");
1342       evprolq(dst, src, shift, vector_len);
1343     }
1344   } else {
1345     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1346     if (etype == T_INT) {
1347       evprord(dst, src, shift, vector_len);
1348     } else {
1349       assert(etype == T_LONG, "expected type T_LONG");
1350       evprorq(dst, src, shift, vector_len);
1351     }
1352   }
1353 }
1354 
1355 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1356                                      XMMRegister shift, int vector_len) {
1357   if (opcode == Op_RotateLeftV) {
1358     if (etype == T_INT) {
1359       evprolvd(dst, src, shift, vector_len);
1360     } else {
1361       assert(etype == T_LONG, "expected type T_LONG");
1362       evprolvq(dst, src, shift, vector_len);
1363     }
1364   } else {
1365     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1366     if (etype == T_INT) {
1367       evprorvd(dst, src, shift, vector_len);
1368     } else {
1369       assert(etype == T_LONG, "expected type T_LONG");
1370       evprorvq(dst, src, shift, vector_len);
1371     }
1372   }
1373 }
1374 
1375 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1376   if (opcode == Op_RShiftVI) {
1377     psrad(dst, shift);
1378   } else if (opcode == Op_LShiftVI) {
1379     pslld(dst, shift);
1380   } else {
1381     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1382     psrld(dst, shift);
1383   }
1384 }
1385 
1386 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1387   switch (opcode) {
1388     case Op_RShiftVI:  psrad(dst, shift); break;
1389     case Op_LShiftVI:  pslld(dst, shift); break;
1390     case Op_URShiftVI: psrld(dst, shift); break;
1391 
1392     default: assert(false, "%s", NodeClassNames[opcode]);
1393   }
1394 }
1395 
1396 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1397   if (opcode == Op_RShiftVI) {
1398     vpsrad(dst, nds, shift, vector_len);
1399   } else if (opcode == Op_LShiftVI) {
1400     vpslld(dst, nds, shift, vector_len);
1401   } else {
1402     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1403     vpsrld(dst, nds, shift, vector_len);
1404   }
1405 }
1406 
1407 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1408   switch (opcode) {
1409     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1410     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1411     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1412 
1413     default: assert(false, "%s", NodeClassNames[opcode]);
1414   }
1415 }
1416 
1417 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1418   switch (opcode) {
1419     case Op_RShiftVB:  // fall-through
1420     case Op_RShiftVS:  psraw(dst, shift); break;
1421 
1422     case Op_LShiftVB:  // fall-through
1423     case Op_LShiftVS:  psllw(dst, shift);   break;
1424 
1425     case Op_URShiftVS: // fall-through
1426     case Op_URShiftVB: psrlw(dst, shift);  break;
1427 
1428     default: assert(false, "%s", NodeClassNames[opcode]);
1429   }
1430 }
1431 
1432 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1433   switch (opcode) {
1434     case Op_RShiftVB:  // fall-through
1435     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1436 
1437     case Op_LShiftVB:  // fall-through
1438     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1439 
1440     case Op_URShiftVS: // fall-through
1441     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1442 
1443     default: assert(false, "%s", NodeClassNames[opcode]);
1444   }
1445 }
1446 
1447 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1448   switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1450     case Op_LShiftVL:  psllq(dst, shift); break;
1451     case Op_URShiftVL: psrlq(dst, shift); break;
1452 
1453     default: assert(false, "%s", NodeClassNames[opcode]);
1454   }
1455 }
1456 
1457 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1458   if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1460   } else if (opcode == Op_LShiftVL) {
1461     psllq(dst, shift);
1462   } else {
1463     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1464     psrlq(dst, shift);
1465   }
1466 }
1467 
1468 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1469   switch (opcode) {
1470     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1471     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1472     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1473 
1474     default: assert(false, "%s", NodeClassNames[opcode]);
1475   }
1476 }
1477 
1478 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1479   if (opcode == Op_RShiftVL) {
1480     evpsraq(dst, nds, shift, vector_len);
1481   } else if (opcode == Op_LShiftVL) {
1482     vpsllq(dst, nds, shift, vector_len);
1483   } else {
1484     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1485     vpsrlq(dst, nds, shift, vector_len);
1486   }
1487 }
1488 
1489 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1490   switch (opcode) {
1491     case Op_RShiftVB:  // fall-through
1492     case Op_RShiftVS:  // fall-through
1493     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1494 
1495     case Op_LShiftVB:  // fall-through
1496     case Op_LShiftVS:  // fall-through
1497     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1498 
1499     case Op_URShiftVB: // fall-through
1500     case Op_URShiftVS: // fall-through
1501     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1502 
1503     default: assert(false, "%s", NodeClassNames[opcode]);
1504   }
1505 }
1506 
1507 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1508   switch (opcode) {
1509     case Op_RShiftVB:  // fall-through
1510     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1511 
1512     case Op_LShiftVB:  // fall-through
1513     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1514 
1515     case Op_URShiftVB: // fall-through
1516     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1517 
1518     default: assert(false, "%s", NodeClassNames[opcode]);
1519   }
1520 }
1521 
1522 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1523   assert(UseAVX >= 2, "required");
1524   switch (opcode) {
1525     case Op_RShiftVL: {
1526       if (UseAVX > 2) {
1527         assert(tmp == xnoreg, "not used");
1528         if (!VM_Version::supports_avx512vl()) {
1529           vlen_enc = Assembler::AVX_512bit;
1530         }
1531         evpsravq(dst, src, shift, vlen_enc);
1532       } else {
1533         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1534         vpsrlvq(dst, src, shift, vlen_enc);
1535         vpsrlvq(tmp, tmp, shift, vlen_enc);
1536         vpxor(dst, dst, tmp, vlen_enc);
1537         vpsubq(dst, dst, tmp, vlen_enc);
1538       }
1539       break;
1540     }
1541     case Op_LShiftVL: {
1542       assert(tmp == xnoreg, "not used");
1543       vpsllvq(dst, src, shift, vlen_enc);
1544       break;
1545     }
1546     case Op_URShiftVL: {
1547       assert(tmp == xnoreg, "not used");
1548       vpsrlvq(dst, src, shift, vlen_enc);
1549       break;
1550     }
1551     default: assert(false, "%s", NodeClassNames[opcode]);
1552   }
1553 }
1554 
// Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
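// The bytes are sign/zero extended to dwords, shifted as dwords, masked back
// into byte range and packed down, leaving the eight results as words in the
// low 128 bits of dst.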
1556 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1557   assert(opcode == Op_LShiftVB ||
1558          opcode == Op_RShiftVB ||
1559          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1560   bool sign = (opcode != Op_URShiftVB);
1561   assert(vector_len == 0, "required");
1562   vextendbd(sign, dst, src, 1);
1563   vpmovzxbd(vtmp, shift, 1);
1564   varshiftd(opcode, dst, dst, vtmp, 1);
1565   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1566   vextracti128_high(vtmp, dst);
1567   vpackusdw(dst, dst, vtmp, 0);
1568 }
1569 
// Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
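// The bytes are sign/zero extended to words, shifted as words, masked back
// into byte range and packed down to bytes (with a vpermq fix-up of the lane
// order in the 256-bit case).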
1571 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1572   assert(opcode == Op_LShiftVB ||
1573          opcode == Op_RShiftVB ||
1574          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1575   bool sign = (opcode != Op_URShiftVB);
1576   int ext_vector_len = vector_len + 1;
1577   vextendbw(sign, dst, src, ext_vector_len);
1578   vpmovzxbw(vtmp, shift, ext_vector_len);
1579   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1580   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1581   if (vector_len == 0) {
1582     vextracti128_high(vtmp, dst);
1583     vpackuswb(dst, dst, vtmp, vector_len);
1584   } else {
1585     vextracti64x4_high(vtmp, dst);
1586     vpackuswb(dst, dst, vtmp, vector_len);
1587     vpermq(dst, dst, 0xD8, vector_len);
1588   }
1589 }
1590 
1591 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1592   switch(typ) {
1593     case T_BYTE:
1594       pinsrb(dst, val, idx);
1595       break;
1596     case T_SHORT:
1597       pinsrw(dst, val, idx);
1598       break;
1599     case T_INT:
1600       pinsrd(dst, val, idx);
1601       break;
1602     case T_LONG:
1603       pinsrq(dst, val, idx);
1604       break;
1605     default:
1606       assert(false,"Should not reach here.");
1607       break;
1608   }
1609 }
1610 
1611 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1612   switch(typ) {
1613     case T_BYTE:
1614       vpinsrb(dst, src, val, idx);
1615       break;
1616     case T_SHORT:
1617       vpinsrw(dst, src, val, idx);
1618       break;
1619     case T_INT:
1620       vpinsrd(dst, src, val, idx);
1621       break;
1622     case T_LONG:
1623       vpinsrq(dst, src, val, idx);
1624       break;
1625     default:
1626       assert(false,"Should not reach here.");
1627       break;
1628   }
1629 }
1630 
1631 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
1632                                                 XMMRegister dst, Register base,
1633                                                 Register idx_base,
1634                                                 Register offset, Register mask,
1635                                                 Register mask_idx, Register rtmp,
1636                                                 int vlen_enc) {
1637   vpxor(dst, dst, dst, vlen_enc);
1638   if (elem_bt == T_SHORT) {
1639     for (int i = 0; i < 4; i++) {
1640       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1641       Label skip_load;
1642       btq(mask, mask_idx);
1643       jccb(Assembler::carryClear, skip_load);
1644       movl(rtmp, Address(idx_base, i * 4));
1645       if (offset != noreg) {
1646         addl(rtmp, offset);
1647       }
1648       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1649       bind(skip_load);
1650       incq(mask_idx);
1651     }
1652   } else {
1653     assert(elem_bt == T_BYTE, "");
1654     for (int i = 0; i < 8; i++) {
1655       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1656       Label skip_load;
1657       btq(mask, mask_idx);
1658       jccb(Assembler::carryClear, skip_load);
1659       movl(rtmp, Address(idx_base, i * 4));
1660       if (offset != noreg) {
1661         addl(rtmp, offset);
1662       }
1663       pinsrb(dst, Address(base, rtmp), i);
1664       bind(skip_load);
1665       incq(mask_idx);
1666     }
1667   }
1668 }
1669 
1670 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
1671                                          Register base, Register idx_base,
1672                                          Register offset, Register rtmp,
1673                                          int vlen_enc) {
1674   vpxor(dst, dst, dst, vlen_enc);
1675   if (elem_bt == T_SHORT) {
1676     for (int i = 0; i < 4; i++) {
1677       // dst[i] = src[offset + idx_base[i]]
1678       movl(rtmp, Address(idx_base, i * 4));
1679       if (offset != noreg) {
1680         addl(rtmp, offset);
1681       }
1682       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1683     }
1684   } else {
1685     assert(elem_bt == T_BYTE, "");
1686     for (int i = 0; i < 8; i++) {
1687       // dst[i] = src[offset + idx_base[i]]
1688       movl(rtmp, Address(idx_base, i * 4));
1689       if (offset != noreg) {
1690         addl(rtmp, offset);
1691       }
1692       pinsrb(dst, Address(base, rtmp), i);
1693     }
1694   }
1695 }
1696 
1697 /*
1698  * Gather using hybrid algorithm, first partially unroll scalar loop
1699  * to accumulate values from gather indices into a quad-word(64bit) slice.
1700  * A slice may hold 8 bytes or 4 short values. This is followed by a vector
1701  * permutation to place the slice into appropriate vector lane
1702  * locations in destination vector. Following pseudo code describes the
1703  * algorithm in detail:
1704  *
1705  * DST_VEC = ZERO_VEC
1706  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1707  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1708  * FOREACH_ITER:
1709  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1710  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1711  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1712  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1713  *
 * With each iteration, the doubleword permute indices (0,1) corresponding to
 * the gathered quadword are shifted right by two lane positions.
1716  *
1717  */
1718 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1719                                         Register base, Register idx_base,
1720                                         Register offset, Register mask,
1721                                         XMMRegister xtmp1, XMMRegister xtmp2,
1722                                         XMMRegister temp_dst, Register rtmp,
1723                                         Register mask_idx, Register length,
1724                                         int vector_len, int vlen_enc) {
1725   Label GATHER8_LOOP;
1726   assert(is_subword_type(elem_ty), "");
1727   movl(length, vector_len);
1728   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1729   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1730   vallones(xtmp2, vlen_enc);
1731   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1732   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1733   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1734 
1735   bind(GATHER8_LOOP);
1736     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1737     if (mask == noreg) {
1738       vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc);
1739     } else {
1740       vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc);
1741     }
1742     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1743     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1744     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1745     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1746     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1747     vpor(dst, dst, temp_dst, vlen_enc);
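    // Advance to the next slice: the 32-bit index array moves by 8 entries
    // (32 bytes) for byte gathers or 4 entries (16 bytes) for short gathers;
    // length drops by the matching element count.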
1748     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
1749     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1750     jcc(Assembler::notEqual, GATHER8_LOOP);
1751 }
1752 
1753 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1754   switch(typ) {
1755     case T_INT:
1756       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1757       break;
1758     case T_FLOAT:
1759       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1760       break;
1761     case T_LONG:
1762       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1763       break;
1764     case T_DOUBLE:
1765       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1766       break;
1767     default:
1768       assert(false,"Should not reach here.");
1769       break;
1770   }
1771 }
1772 
1773 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1774   switch(typ) {
1775     case T_INT:
1776       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1777       break;
1778     case T_FLOAT:
1779       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1780       break;
1781     case T_LONG:
1782       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1783       break;
1784     case T_DOUBLE:
1785       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1786       break;
1787     default:
1788       assert(false,"Should not reach here.");
1789       break;
1790   }
1791 }
1792 
1793 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1794   switch(typ) {
1795     case T_INT:
1796       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1797       break;
1798     case T_FLOAT:
1799       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1800       break;
1801     case T_LONG:
1802       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1803       break;
1804     case T_DOUBLE:
1805       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1806       break;
1807     default:
1808       assert(false,"Should not reach here.");
1809       break;
1810   }
1811 }
1812 
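// Convert a boolean vector (one byte per element, expected to be 0 or 1) into
// a mask vector: negating the bytes turns 1 into -1 (all bits set), and the
// result is then sign-extended to the element width.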
1813 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1814   if (vlen_in_bytes <= 16) {
1815     pxor (dst, dst);
1816     psubb(dst, src);
1817     switch (elem_bt) {
1818       case T_BYTE:   /* nothing to do */ break;
1819       case T_SHORT:  pmovsxbw(dst, dst); break;
1820       case T_INT:    pmovsxbd(dst, dst); break;
1821       case T_FLOAT:  pmovsxbd(dst, dst); break;
1822       case T_LONG:   pmovsxbq(dst, dst); break;
1823       case T_DOUBLE: pmovsxbq(dst, dst); break;
1824 
1825       default: assert(false, "%s", type2name(elem_bt));
1826     }
1827   } else {
1828     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1829     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1830 
1831     vpxor (dst, dst, dst, vlen_enc);
1832     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1833 
1834     switch (elem_bt) {
1835       case T_BYTE:   /* nothing to do */            break;
1836       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1837       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1838       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1839       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1840       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1841 
1842       default: assert(false, "%s", type2name(elem_bt));
1843     }
1844   }
1845 }
1846 
1847 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1848   if (novlbwdq) {
1849     vpmovsxbd(xtmp, src, vlen_enc);
1850     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1851             Assembler::eq, true, vlen_enc, noreg);
1852   } else {
1853     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1854     vpsubb(xtmp, xtmp, src, vlen_enc);
1855     evpmovb2m(dst, xtmp, vlen_enc);
1856   }
1857 }
1858 
1859 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1860   if (is_integral_type(bt)) {
1861     switch (vlen_in_bytes) {
1862       case 4:  movdl(dst, src);   break;
1863       case 8:  movq(dst, src);    break;
1864       case 16: movdqu(dst, src);  break;
1865       case 32: vmovdqu(dst, src); break;
1866       case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1867       default: ShouldNotReachHere();
1868     }
1869   } else {
1870     switch (vlen_in_bytes) {
1871       case 4:  movflt(dst, src); break;
1872       case 8:  movdbl(dst, src); break;
1873       case 16: movups(dst, src); break;
1874       case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1875       case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1876       default: ShouldNotReachHere();
1877     }
1878   }
1879 }
1880 
1881 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1882   assert(rscratch != noreg || always_reachable(src), "missing");
1883 
1884   if (reachable(src)) {
1885     load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1886   } else {
1887     lea(rscratch, src);
1888     load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1889   }
1890 }
1891 
1892 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1893   int vlen_enc = vector_length_encoding(vlen);
1894   if (VM_Version::supports_avx()) {
1895     if (bt == T_LONG) {
1896       if (VM_Version::supports_avx2()) {
1897         vpbroadcastq(dst, src, vlen_enc);
1898       } else {
1899         vmovddup(dst, src, vlen_enc);
1900       }
1901     } else if (bt == T_DOUBLE) {
1902       if (vlen_enc != Assembler::AVX_128bit) {
1903         vbroadcastsd(dst, src, vlen_enc, noreg);
1904       } else {
1905         vmovddup(dst, src, vlen_enc);
1906       }
1907     } else {
1908       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1909         vpbroadcastd(dst, src, vlen_enc);
1910       } else {
1911         vbroadcastss(dst, src, vlen_enc);
1912       }
1913     }
1914   } else if (VM_Version::supports_sse3()) {
1915     movddup(dst, src);
1916   } else {
1917     load_vector(bt, dst, src, vlen);
1918   }
1919 }
1920 
1921 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1922   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
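  // For example, T_INT starts at exact_log2(4) << 6 = 128, and T_FLOAT at
  // 128 + 128 = 256 bytes into the table.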
1923   int offset = exact_log2(type2aelembytes(bt)) << 6;
1924   if (is_floating_point_type(bt)) {
1925     offset += 128;
1926   }
1927   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1928   load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1929 }
1930 
1931 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
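//
// For floats and doubles, the reduceF/reduceD paths below combine elements one
// at a time with scalar operations (addss/addsd, mulss/mulsd), preserving the
// evaluation order, while the unorderedReduce* paths fold vector halves with
// packed operations and may therefore reassociate the terms.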
1932 
1933 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1934   int vector_len = Assembler::AVX_128bit;
1935 
1936   switch (opcode) {
1937     case Op_AndReductionV:  pand(dst, src); break;
1938     case Op_OrReductionV:   por (dst, src); break;
1939     case Op_XorReductionV:  pxor(dst, src); break;
1940     case Op_MinReductionV:
1941       switch (typ) {
1942         case T_BYTE:        pminsb(dst, src); break;
1943         case T_SHORT:       pminsw(dst, src); break;
1944         case T_INT:         pminsd(dst, src); break;
1945         case T_LONG:        assert(UseAVX > 2, "required");
1946                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1947         default:            assert(false, "wrong type");
1948       }
1949       break;
1950     case Op_MaxReductionV:
1951       switch (typ) {
1952         case T_BYTE:        pmaxsb(dst, src); break;
1953         case T_SHORT:       pmaxsw(dst, src); break;
1954         case T_INT:         pmaxsd(dst, src); break;
1955         case T_LONG:        assert(UseAVX > 2, "required");
1956                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1957         default:            assert(false, "wrong type");
1958       }
1959       break;
1960     case Op_AddReductionVF: addss(dst, src); break;
1961     case Op_AddReductionVD: addsd(dst, src); break;
1962     case Op_AddReductionVI:
1963       switch (typ) {
1964         case T_BYTE:        paddb(dst, src); break;
1965         case T_SHORT:       paddw(dst, src); break;
1966         case T_INT:         paddd(dst, src); break;
1967         default:            assert(false, "wrong type");
1968       }
1969       break;
1970     case Op_AddReductionVL: paddq(dst, src); break;
1971     case Op_MulReductionVF: mulss(dst, src); break;
1972     case Op_MulReductionVD: mulsd(dst, src); break;
1973     case Op_MulReductionVI:
1974       switch (typ) {
1975         case T_SHORT:       pmullw(dst, src); break;
1976         case T_INT:         pmulld(dst, src); break;
1977         default:            assert(false, "wrong type");
1978       }
1979       break;
1980     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1981                             evpmullq(dst, dst, src, vector_len); break;
1982     default:                assert(false, "wrong opcode");
1983   }
1984 }
1985 
1986 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1987   switch (opcode) {
1988     case Op_AddReductionVF: addps(dst, src); break;
1989     case Op_AddReductionVD: addpd(dst, src); break;
1990     case Op_MulReductionVF: mulps(dst, src); break;
1991     case Op_MulReductionVD: mulpd(dst, src); break;
1992     default:                assert(false, "%s", NodeClassNames[opcode]);
1993   }
1994 }
1995 
1996 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1997   int vector_len = Assembler::AVX_256bit;
1998 
1999   switch (opcode) {
2000     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
2001     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
2002     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
2003     case Op_MinReductionV:
2004       switch (typ) {
2005         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
2006         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
2007         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
2008         case T_LONG:        assert(UseAVX > 2, "required");
2009                             vpminsq(dst, src1, src2, vector_len); break;
2010         default:            assert(false, "wrong type");
2011       }
2012       break;
2013     case Op_MaxReductionV:
2014       switch (typ) {
2015         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
2016         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
2017         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
2018         case T_LONG:        assert(UseAVX > 2, "required");
2019                             vpmaxsq(dst, src1, src2, vector_len); break;
2020         default:            assert(false, "wrong type");
2021       }
2022       break;
2023     case Op_AddReductionVI:
2024       switch (typ) {
2025         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
2026         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
2027         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
2028         default:            assert(false, "wrong type");
2029       }
2030       break;
2031     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
2032     case Op_MulReductionVI:
2033       switch (typ) {
2034         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
2035         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
2036         default:            assert(false, "wrong type");
2037       }
2038       break;
2039     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
2040     default:                assert(false, "wrong opcode");
2041   }
2042 }
2043 
2044 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
2045   int vector_len = Assembler::AVX_256bit;
2046 
2047   switch (opcode) {
2048     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
2049     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
2050     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
2051     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
2052     default:                assert(false, "%s", NodeClassNames[opcode]);
2053   }
2054 }
2055 
2056 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
2057                                   XMMRegister dst, XMMRegister src,
2058                                   XMMRegister vtmp1, XMMRegister vtmp2) {
2059   switch (opcode) {
2060     case Op_AddReductionVF:
2061     case Op_MulReductionVF:
2062       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
2063       break;
2064 
2065     case Op_AddReductionVD:
2066     case Op_MulReductionVD:
2067       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
2068       break;
2069 
2070     default: assert(false, "wrong opcode");
2071   }
2072 }
2073 
2074 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
2075                                             XMMRegister dst, XMMRegister src,
2076                                             XMMRegister vtmp1, XMMRegister vtmp2) {
2077   switch (opcode) {
2078     case Op_AddReductionVF:
2079     case Op_MulReductionVF:
2080       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
2081       break;
2082 
2083     case Op_AddReductionVD:
2084     case Op_MulReductionVD:
2085       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
2086       break;
2087 
2088     default: assert(false, "%s", NodeClassNames[opcode]);
2089   }
2090 }
2091 
2092 void C2_MacroAssembler::reduceB(int opcode, int vlen,
2093                              Register dst, Register src1, XMMRegister src2,
2094                              XMMRegister vtmp1, XMMRegister vtmp2) {
2095   switch (vlen) {
2096     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2097     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2098     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2099     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2100 
2101     default: assert(false, "wrong vector length");
2102   }
2103 }
2104 
2105 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
2106                              Register dst, Register src1, XMMRegister src2,
2107                              XMMRegister vtmp1, XMMRegister vtmp2) {
2108   switch (vlen) {
2109     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2110     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2111     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2112     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2113 
2114     default: assert(false, "wrong vector length");
2115   }
2116 }
2117 
2118 void C2_MacroAssembler::reduceS(int opcode, int vlen,
2119                              Register dst, Register src1, XMMRegister src2,
2120                              XMMRegister vtmp1, XMMRegister vtmp2) {
2121   switch (vlen) {
2122     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2123     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2124     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2125     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2126 
2127     default: assert(false, "wrong vector length");
2128   }
2129 }
2130 
2131 void C2_MacroAssembler::reduceI(int opcode, int vlen,
2132                              Register dst, Register src1, XMMRegister src2,
2133                              XMMRegister vtmp1, XMMRegister vtmp2) {
2134   switch (vlen) {
2135     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2136     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2137     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2138     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2139 
2140     default: assert(false, "wrong vector length");
2141   }
2142 }
2143 
2144 void C2_MacroAssembler::reduceL(int opcode, int vlen,
2145                              Register dst, Register src1, XMMRegister src2,
2146                              XMMRegister vtmp1, XMMRegister vtmp2) {
2147   switch (vlen) {
2148     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2149     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2150     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2151 
2152     default: assert(false, "wrong vector length");
2153   }
2154 }
2155 
2156 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2157   switch (vlen) {
2158     case 2:
2159       assert(vtmp2 == xnoreg, "");
2160       reduce2F(opcode, dst, src, vtmp1);
2161       break;
2162     case 4:
2163       assert(vtmp2 == xnoreg, "");
2164       reduce4F(opcode, dst, src, vtmp1);
2165       break;
2166     case 8:
2167       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2168       break;
2169     case 16:
2170       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2171       break;
2172     default: assert(false, "wrong vector length");
2173   }
2174 }
2175 
2176 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2177   switch (vlen) {
2178     case 2:
2179       assert(vtmp2 == xnoreg, "");
2180       reduce2D(opcode, dst, src, vtmp1);
2181       break;
2182     case 4:
2183       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2184       break;
2185     case 8:
2186       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2187       break;
2188     default: assert(false, "wrong vector length");
2189   }
2190 }
2191 
2192 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2193   switch (vlen) {
2194     case 2:
2195       assert(vtmp1 == xnoreg, "");
2196       assert(vtmp2 == xnoreg, "");
2197       unorderedReduce2F(opcode, dst, src);
2198       break;
2199     case 4:
2200       assert(vtmp2 == xnoreg, "");
2201       unorderedReduce4F(opcode, dst, src, vtmp1);
2202       break;
2203     case 8:
2204       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2205       break;
2206     case 16:
2207       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2208       break;
2209     default: assert(false, "wrong vector length");
2210   }
2211 }
2212 
2213 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2214   switch (vlen) {
2215     case 2:
2216       assert(vtmp1 == xnoreg, "");
2217       assert(vtmp2 == xnoreg, "");
2218       unorderedReduce2D(opcode, dst, src);
2219       break;
2220     case 4:
2221       assert(vtmp2 == xnoreg, "");
2222       unorderedReduce4D(opcode, dst, src, vtmp1);
2223       break;
2224     case 8:
2225       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2226       break;
2227     default: assert(false, "wrong vector length");
2228   }
2229 }
2230 
2231 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2232   if (opcode == Op_AddReductionVI) {
2233     if (vtmp1 != src2) {
2234       movdqu(vtmp1, src2);
2235     }
2236     phaddd(vtmp1, vtmp1);
2237   } else {
2238     pshufd(vtmp1, src2, 0x1);
2239     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2240   }
2241   movdl(vtmp2, src1);
2242   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2243   movdl(dst, vtmp1);
2244 }
2245 
2246 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2247   if (opcode == Op_AddReductionVI) {
2248     if (vtmp1 != src2) {
2249       movdqu(vtmp1, src2);
2250     }
2251     phaddd(vtmp1, src2);
2252     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2253   } else {
2254     pshufd(vtmp2, src2, 0xE);
2255     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2256     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2257   }
2258 }
2259 
2260 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2261   if (opcode == Op_AddReductionVI) {
2262     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2263     vextracti128_high(vtmp2, vtmp1);
2264     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2265     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2266   } else {
2267     vextracti128_high(vtmp1, src2);
2268     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2269     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2270   }
2271 }
2272 
2273 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2274   vextracti64x4_high(vtmp2, src2);
2275   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2276   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2277 }
2278 
2279 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2280   pshufd(vtmp2, src2, 0x1);
2281   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2282   movdqu(vtmp1, vtmp2);
2283   psrldq(vtmp1, 2);
2284   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2285   movdqu(vtmp2, vtmp1);
2286   psrldq(vtmp2, 1);
2287   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2288   movdl(vtmp2, src1);
2289   pmovsxbd(vtmp1, vtmp1);
2290   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2291   pextrb(dst, vtmp1, 0x0);
2292   movsbl(dst, dst);
2293 }
2294 
2295 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2296   pshufd(vtmp1, src2, 0xE);
2297   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2298   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2299 }
2300 
2301 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2302   vextracti128_high(vtmp2, src2);
2303   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2304   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2305 }
2306 
2307 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2308   vextracti64x4_high(vtmp1, src2);
2309   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2310   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2311 }
2312 
2313 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2314   pmovsxbw(vtmp2, src2);
2315   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2316 }
2317 
2318 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2319   if (UseAVX > 1) {
2320     int vector_len = Assembler::AVX_256bit;
2321     vpmovsxbw(vtmp1, src2, vector_len);
2322     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2323   } else {
2324     pmovsxbw(vtmp2, src2);
2325     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2326     pshufd(vtmp2, src2, 0x1);
2327     pmovsxbw(vtmp2, src2);
2328     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2329   }
2330 }
2331 
2332 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2333   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2334     int vector_len = Assembler::AVX_512bit;
2335     vpmovsxbw(vtmp1, src2, vector_len);
2336     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2337   } else {
2338     assert(UseAVX >= 2,"Should not reach here.");
2339     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2340     vextracti128_high(vtmp2, src2);
2341     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2342   }
2343 }
2344 
2345 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2346   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2347   vextracti64x4_high(vtmp2, src2);
2348   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2349 }
2350 
2351 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2352   if (opcode == Op_AddReductionVI) {
2353     if (vtmp1 != src2) {
2354       movdqu(vtmp1, src2);
2355     }
2356     phaddw(vtmp1, vtmp1);
2357     phaddw(vtmp1, vtmp1);
2358   } else {
2359     pshufd(vtmp2, src2, 0x1);
2360     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2361     movdqu(vtmp1, vtmp2);
2362     psrldq(vtmp1, 2);
2363     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2364   }
2365   movdl(vtmp2, src1);
2366   pmovsxwd(vtmp1, vtmp1);
2367   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2368   pextrw(dst, vtmp1, 0x0);
2369   movswl(dst, dst);
2370 }
2371 
2372 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2373   if (opcode == Op_AddReductionVI) {
2374     if (vtmp1 != src2) {
2375       movdqu(vtmp1, src2);
2376     }
2377     phaddw(vtmp1, src2);
2378   } else {
2379     pshufd(vtmp1, src2, 0xE);
2380     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2381   }
2382   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2383 }
2384 
2385 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2386   if (opcode == Op_AddReductionVI) {
2387     int vector_len = Assembler::AVX_256bit;
2388     vphaddw(vtmp2, src2, src2, vector_len);
2389     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2390   } else {
2391     vextracti128_high(vtmp2, src2);
2392     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2393   }
2394   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2395 }
2396 
2397 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2398   int vector_len = Assembler::AVX_256bit;
2399   vextracti64x4_high(vtmp1, src2);
2400   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2401   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2402 }
2403 
2404 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2405   pshufd(vtmp2, src2, 0xE);
2406   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2407   movdq(vtmp1, src1);
2408   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2409   movdq(dst, vtmp1);
2410 }
2411 
2412 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2413   vextracti128_high(vtmp1, src2);
2414   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2415   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2416 }
2417 
2418 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2419   vextracti64x4_high(vtmp2, src2);
2420   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2421   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2422 }
2423 
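// Produce a k-register mask with the low `len` bits set: bzhi zeroes all bits
// of the all-ones pattern at positions >= len.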
2424 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2425   mov64(temp, -1L);
2426   bzhiq(temp, temp, len);
2427   kmovql(dst, temp);
2428 }
2429 
2430 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2431   reduce_operation_128(T_FLOAT, opcode, dst, src);
2432   pshufd(vtmp, src, 0x1);
2433   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2434 }
2435 
2436 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2437   reduce2F(opcode, dst, src, vtmp);
2438   pshufd(vtmp, src, 0x2);
2439   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2440   pshufd(vtmp, src, 0x3);
2441   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2442 }
2443 
2444 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2445   reduce4F(opcode, dst, src, vtmp2);
2446   vextractf128_high(vtmp2, src);
2447   reduce4F(opcode, dst, vtmp2, vtmp1);
2448 }
2449 
2450 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2451   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2452   vextracti64x4_high(vtmp1, src);
2453   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2454 }
2455 
2456 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2457   pshufd(dst, src, 0x1);
2458   reduce_operation_128(T_FLOAT, opcode, dst, src);
2459 }
2460 
2461 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2462   pshufd(vtmp, src, 0xE);
2463   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2464   unorderedReduce2F(opcode, dst, vtmp);
2465 }
2466 
2467 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2468   vextractf128_high(vtmp1, src);
2469   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2470   unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2471 }
2472 
2473 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2474   vextractf64x4_high(vtmp2, src);
2475   unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2476   unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2477 }
2478 
2479 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2480   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2481   pshufd(vtmp, src, 0xE);
2482   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2483 }
2484 
2485 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2486   reduce2D(opcode, dst, src, vtmp2);
2487   vextractf128_high(vtmp2, src);
2488   reduce2D(opcode, dst, vtmp2, vtmp1);
2489 }
2490 
2491 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2492   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2493   vextracti64x4_high(vtmp1, src);
2494   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2495 }
2496 
2497 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2498   pshufd(dst, src, 0xE);
2499   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2500 }
2501 
2502 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2503   vextractf128_high(vtmp, src);
2504   unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2505   unorderedReduce2D(opcode, dst, vtmp);
2506 }
2507 
2508 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2509   vextractf64x4_high(vtmp2, src);
2510   unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2511   unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2512 }
2513 
2514 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2515   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2516 }
2517 
2518 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2519   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2520 }
2521 
2522 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
2523   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2524 }
2525 
2526 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2527                                  int vec_enc) {
2528   switch(elem_bt) {
2529     case T_INT:
2530     case T_FLOAT:
2531       vmaskmovps(dst, src, mask, vec_enc);
2532       break;
2533     case T_LONG:
2534     case T_DOUBLE:
2535       vmaskmovpd(dst, src, mask, vec_enc);
2536       break;
2537     default:
2538       fatal("Unsupported type %s", type2name(elem_bt));
2539       break;
2540   }
2541 }
2542 
2543 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2544                                  int vec_enc) {
2545   switch(elem_bt) {
2546     case T_INT:
2547     case T_FLOAT:
2548       vmaskmovps(dst, src, mask, vec_enc);
2549       break;
2550     case T_LONG:
2551     case T_DOUBLE:
2552       vmaskmovpd(dst, src, mask, vec_enc);
2553       break;
2554     default:
2555       fatal("Unsupported type %s", type2name(elem_bt));
2556       break;
2557   }
2558 }
2559 
2560 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2561                                           XMMRegister dst, XMMRegister src,
2562                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2563                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2564   const int permconst[] = {1, 14};
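  // vpermilps immediates for the in-lane folding steps: 14 (0b1110) moves
  // elements {2,3} of each 128-bit lane down to positions {0,1}, and 1 moves
  // element 1 down to position 0.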
2565   XMMRegister wsrc = src;
2566   XMMRegister wdst = xmm_0;
2567   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2568 
2569   int vlen_enc = Assembler::AVX_128bit;
2570   if (vlen == 16) {
2571     vlen_enc = Assembler::AVX_256bit;
2572   }
2573 
2574   for (int i = log2(vlen) - 1; i >=0; i--) {
2575     if (i == 0 && !is_dst_valid) {
2576       wdst = dst;
2577     }
2578     if (i == 3) {
2579       vextracti64x4_high(wtmp, wsrc);
2580     } else if (i == 2) {
2581       vextracti128_high(wtmp, wsrc);
2582     } else { // i = [0,1]
2583       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2584     }
2585 
2586     if (VM_Version::supports_avx10_2()) {
2587       vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
2588     } else {
2589       vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2590     }
2591     wsrc = wdst;
2592     vlen_enc = Assembler::AVX_128bit;
2593   }
2594   if (is_dst_valid) {
2595     if (VM_Version::supports_avx10_2()) {
2596       vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
2597     } else {
2598       vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2599     }
2600   }
2601 }
2602 
2603 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2604                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2605                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2606   XMMRegister wsrc = src;
2607   XMMRegister wdst = xmm_0;
2608   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2609   int vlen_enc = Assembler::AVX_128bit;
2610   if (vlen == 8) {
2611     vlen_enc = Assembler::AVX_256bit;
2612   }
2613   for (int i = log2(vlen) - 1; i >=0; i--) {
2614     if (i == 0 && !is_dst_valid) {
2615       wdst = dst;
2616     }
2617     if (i == 1) {
2618       vextracti128_high(wtmp, wsrc);
2619     } else if (i == 2) {
2620       vextracti64x4_high(wtmp, wsrc);
2621     } else {
2622       assert(i == 0, "%d", i);
2623       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2624     }
2625 
2626     if (VM_Version::supports_avx10_2()) {
2627       vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
2628     } else {
2629       vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2630     }
2631 
2632     wsrc = wdst;
2633     vlen_enc = Assembler::AVX_128bit;
2634   }
2635 
2636   if (is_dst_valid) {
2637     if (VM_Version::supports_avx10_2()) {
2638       vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
2639     } else {
2640       vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2641     }
2642   }
2643 }
2644 
2645 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2646   switch (bt) {
2647     case T_BYTE:  pextrb(dst, src, idx); break;
2648     case T_SHORT: pextrw(dst, src, idx); break;
2649     case T_INT:   pextrd(dst, src, idx); break;
2650     case T_LONG:  pextrq(dst, src, idx); break;
2651 
2652     default:
2653       assert(false,"Should not reach here.");
2654       break;
2655   }
2656 }
2657 
2658 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2659   int esize =  type2aelembytes(typ);
2660   int elem_per_lane = 16/esize;
2661   int lane = elemindex / elem_per_lane;
2662   int eindex = elemindex % elem_per_lane;
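  // For example, T_INT (esize 4) gives 4 elements per 128-bit lane, so
  // elemindex 9 maps to lane 2, element 1 within that lane.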
2663 
2664   if (lane >= 2) {
2665     assert(UseAVX > 2, "required");
2666     vextractf32x4(dst, src, lane & 3);
2667     return dst;
2668   } else if (lane > 0) {
2669     assert(UseAVX > 0, "required");
2670     vextractf128(dst, src, lane);
2671     return dst;
2672   } else {
2673     return src;
2674   }
2675 }
2676 
2677 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2678   if (typ == T_BYTE) {
2679     movsbl(dst, dst);
2680   } else if (typ == T_SHORT) {
2681     movswl(dst, dst);
2682   }
2683 }
2684 
2685 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2686   int esize =  type2aelembytes(typ);
2687   int elem_per_lane = 16/esize;
2688   int eindex = elemindex % elem_per_lane;
2689   assert(is_integral_type(typ),"required");
2690 
2691   if (eindex == 0) {
2692     if (typ == T_LONG) {
2693       movq(dst, src);
2694     } else {
2695       movdl(dst, src);
2696       movsxl(typ, dst);
2697     }
2698   } else {
2699     extract(typ, dst, src, eindex);
2700     movsxl(typ, dst);
2701   }
2702 }
2703 
2704 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2705   int esize =  type2aelembytes(typ);
2706   int elem_per_lane = 16/esize;
2707   int eindex = elemindex % elem_per_lane;
2708   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2709 
2710   if (eindex == 0) {
2711     movq(dst, src);
2712   } else {
2713     if (typ == T_FLOAT) {
2714       if (UseAVX == 0) {
2715         movdqu(dst, src);
2716         shufps(dst, dst, eindex);
2717       } else {
2718         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2719       }
2720     } else {
2721       if (UseAVX == 0) {
2722         movdqu(dst, src);
2723         psrldq(dst, eindex*esize);
2724       } else {
2725         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2726       }
2727       movq(dst, dst);
2728     }
2729   }
2730   // Zero upper bits
2731   if (typ == T_FLOAT) {
2732     if (UseAVX == 0) {
2733       assert(vtmp != xnoreg, "required.");
2734       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2735       pand(dst, vtmp);
2736     } else {
2737       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2738     }
2739   }
2740 }
2741 
2742 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2743   switch(typ) {
2744     case T_BYTE:
2745     case T_BOOLEAN:
2746       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2747       break;
2748     case T_SHORT:
2749     case T_CHAR:
2750       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2751       break;
2752     case T_INT:
2753     case T_FLOAT:
2754       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2755       break;
2756     case T_LONG:
2757     case T_DOUBLE:
2758       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2759       break;
2760     default:
2761       assert(false,"Should not reach here.");
2762       break;
2763   }
2764 }
2765 
2766 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2767   assert(rscratch != noreg || always_reachable(src2), "missing");
2768 
2769   switch(typ) {
2770     case T_BOOLEAN:
2771     case T_BYTE:
2772       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2773       break;
2774     case T_CHAR:
2775     case T_SHORT:
2776       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2777       break;
2778     case T_INT:
2779     case T_FLOAT:
2780       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2781       break;
2782     case T_LONG:
2783     case T_DOUBLE:
2784       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2785       break;
2786     default:
2787       assert(false,"Should not reach here.");
2788       break;
2789   }
2790 }
2791 
2792 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2793   switch(typ) {
2794     case T_BYTE:
2795       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2796       break;
2797     case T_SHORT:
2798       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2799       break;
2800     case T_INT:
2801     case T_FLOAT:
2802       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2803       break;
2804     case T_LONG:
2805     case T_DOUBLE:
2806       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2807       break;
2808     default:
2809       assert(false,"Should not reach here.");
2810       break;
2811   }
2812 }
2813 
2814 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2815   assert(vlen_in_bytes <= 32, "");
2816   int esize = type2aelembytes(bt);
2817   if (vlen_in_bytes == 32) {
2818     assert(vtmp == xnoreg, "required.");
2819     if (esize >= 4) {
2820       vtestps(src1, src2, AVX_256bit);
2821     } else {
2822       vptest(src1, src2, AVX_256bit);
2823     }
2824     return;
2825   }
2826   if (vlen_in_bytes < 16) {
2827     // Duplicate the lower part to fill the whole register;
2828     // there is no need to do so for src2.
2829     assert(vtmp != xnoreg, "required");
2830     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2831     pshufd(vtmp, src1, shuffle_imm);
2832   } else {
2833     assert(vtmp == xnoreg, "required");
2834     vtmp = src1;
2835   }
2836   if (esize >= 4 && VM_Version::supports_avx()) {
2837     vtestps(vtmp, src2, AVX_128bit);
2838   } else {
2839     ptest(vtmp, src2);
2840   }
2841 }
2842 
2843 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2844 #ifdef ASSERT
2845   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2846   bool is_bw_supported = VM_Version::supports_avx512bw();
2847   if (is_bw && !is_bw_supported) {
2848     assert(vlen_enc != Assembler::AVX_512bit, "required");
2849     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2850            "XMM register should be 0-15");
2851   }
2852 #endif // ASSERT
2853   switch (elem_bt) {
2854     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2855     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2856     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2857     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2858     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2859     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2860     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2861   }
2862 }
2863 
2864 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2865   assert(UseAVX >= 2, "required");
2866   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2867   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2868   if ((UseAVX > 2) &&
2869       (!is_bw || VM_Version::supports_avx512bw()) &&
2870       (!is_vl || VM_Version::supports_avx512vl())) {
2871     switch (elem_bt) {
2872       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2873       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2874       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2875       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2876       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2877     }
2878   } else {
2879     assert(vlen_enc != Assembler::AVX_512bit, "required");
2880     assert((dst->encoding() < 16),"XMM register should be 0-15");
2881     switch (elem_bt) {
2882       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2883       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2884       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2885       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2886       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2887       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2888       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2889     }
2890   }
2891 }
2892 
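     // Sign-extends byte vector elements to the requested element type.  For
     // T_FLOAT and T_DOUBLE the bytes are first widened to ints and then
     // converted; the T_DOUBLE path widens at half the target vector width
     // because the subsequent int->double conversion doubles the element size.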
2893 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2894   switch (to_elem_bt) {
2895     case T_SHORT:
2896       vpmovsxbw(dst, src, vlen_enc);
2897       break;
2898     case T_INT:
2899       vpmovsxbd(dst, src, vlen_enc);
2900       break;
2901     case T_FLOAT:
2902       vpmovsxbd(dst, src, vlen_enc);
2903       vcvtdq2ps(dst, dst, vlen_enc);
2904       break;
2905     case T_LONG:
2906       vpmovsxbq(dst, src, vlen_enc);
2907       break;
2908     case T_DOUBLE: {
2909       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2910       vpmovsxbd(dst, src, mid_vlen_enc);
2911       vcvtdq2pd(dst, dst, vlen_enc);
2912       break;
2913     }
2914     default:
2915       fatal("Unsupported type %s", type2name(to_elem_bt));
2916       break;
2917   }
2918 }
2919 
2920 //-------------------------------------------------------------------------------------------
2921 
2922 // IndexOf for constant substrings with size >= 8 chars
2923 // which don't need to be loaded through the stack.
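     // A rough scalar equivalent of the search performed here (illustrative
     // sketch only; the generated code scans the string in 16-byte chunks with
     // pcmpestri instead of element by element):
     //
     //   int index_of(const elem_t* str1, int cnt1, const elem_t* str2, int cnt2) {
     //     for (int i = 0; i + cnt2 <= cnt1; i++) {       // candidate start offsets
     //       int j = 0;
     //       while (j < cnt2 && str1[i + j] == str2[j]) j++;
     //       if (j == cnt2) return i;                     // full match at offset i
     //     }
     //     return -1;                                     // not found
     //   }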
2924 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2925                                          Register cnt1, Register cnt2,
2926                                          int int_cnt2,  Register result,
2927                                          XMMRegister vec, Register tmp,
2928                                          int ae) {
2929   ShortBranchVerifier sbv(this);
2930   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2931   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2932 
2933   // This method uses the pcmpestri instruction with bound registers
2934   //   inputs:
2935   //     xmm - substring
2936   //     rax - substring length (elements count)
2937   //     mem - scanned string
2938   //     rdx - string length (elements count)
2939   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2940   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2941   //   outputs:
2942   //     rcx - matched index in string
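       //   For reference, the pcmpestri imm8 mode bits are: [1:0] source format
       //   (00 = unsigned bytes, 01 = unsigned words), [3:2] aggregation
       //   (11 = "equal ordered", i.e. substring search), [5:4] polarity and
       //   [6] index selection (0 = least significant matching index).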
2943   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2944   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2945   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2946   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2947   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2948 
2949   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2950         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2951         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2952 
2953   // Note, inline_string_indexOf() generates checks:
2954   // if (substr.count > string.count) return -1;
2955   // if (substr.count == 0) return 0;
2956   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2957 
2958   // Load substring.
2959   if (ae == StrIntrinsicNode::UL) {
2960     pmovzxbw(vec, Address(str2, 0));
2961   } else {
2962     movdqu(vec, Address(str2, 0));
2963   }
2964   movl(cnt2, int_cnt2);
2965   movptr(result, str1); // string addr
2966 
2967   if (int_cnt2 > stride) {
2968     jmpb(SCAN_TO_SUBSTR);
2969 
2970     // Reload substr for rescan; this code
2971     // is executed only for large substrings (> 8 chars).
2972     bind(RELOAD_SUBSTR);
2973     if (ae == StrIntrinsicNode::UL) {
2974       pmovzxbw(vec, Address(str2, 0));
2975     } else {
2976       movdqu(vec, Address(str2, 0));
2977     }
2978     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2979 
2980     bind(RELOAD_STR);
2981     // We came here after the beginning of the substring was
2982     // matched but the rest of it was not, so we need to search
2983     // again. Start from the next element after the previous match.
2984 
2985     // cnt2 is the number of remaining substring elements and
2986     // cnt1 is the number of remaining string elements when the compare failed.
2987     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2988     subl(cnt1, cnt2);
2989     addl(cnt1, int_cnt2);
2990     movl(cnt2, int_cnt2); // Now restore cnt2
2991 
2992     decrementl(cnt1);     // Shift to next element
2993     cmpl(cnt1, cnt2);
2994     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2995 
2996     addptr(result, (1<<scale1));
2997 
2998   } // (int_cnt2 > 8)
2999 
3000   // Scan string for start of substr in 16-byte vectors
3001   bind(SCAN_TO_SUBSTR);
3002   pcmpestri(vec, Address(result, 0), mode);
3003   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3004   subl(cnt1, stride);
3005   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3006   cmpl(cnt1, cnt2);
3007   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3008   addptr(result, 16);
3009   jmpb(SCAN_TO_SUBSTR);
3010 
3011   // Found a potential substr
3012   bind(FOUND_CANDIDATE);
3013   // Matched whole vector if first element matched (tmp(rcx) == 0).
3014   if (int_cnt2 == stride) {
3015     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
3016   } else { // int_cnt2 > 8
3017     jccb(Assembler::overflow, FOUND_SUBSTR);
3018   }
3019   // After pcmpestri tmp(rcx) contains matched element index
3020   // Compute start addr of substr
3021   lea(result, Address(result, tmp, scale1));
3022 
3023   // Make sure string is still long enough
3024   subl(cnt1, tmp);
3025   cmpl(cnt1, cnt2);
3026   if (int_cnt2 == stride) {
3027     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3028   } else { // int_cnt2 > 8
3029     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
3030   }
3031   // Left less than substring.
3032 
3033   bind(RET_NOT_FOUND);
3034   movl(result, -1);
3035   jmp(EXIT);
3036 
3037   if (int_cnt2 > stride) {
3038     // This code is optimized for the case when the whole substring
3039     // is matched once its head is matched.
3040     bind(MATCH_SUBSTR_HEAD);
3041     pcmpestri(vec, Address(result, 0), mode);
3042     // Reload only the string if it does not match.
3043     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
3044 
3045     Label CONT_SCAN_SUBSTR;
3046     // Compare the rest of substring (> 8 chars).
3047     bind(FOUND_SUBSTR);
3048     // First 8 chars are already matched.
3049     negptr(cnt2);
3050     addptr(cnt2, stride);
3051 
3052     bind(SCAN_SUBSTR);
3053     subl(cnt1, stride);
3054     cmpl(cnt2, -stride); // Do not read beyond substring
3055     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
3056     // Back-up strings to avoid reading beyond substring:
3057     // cnt1 = cnt1 - cnt2 + 8
3058     addl(cnt1, cnt2); // cnt2 is negative
3059     addl(cnt1, stride);
3060     movl(cnt2, stride); negptr(cnt2);
3061     bind(CONT_SCAN_SUBSTR);
3062     if (int_cnt2 < (int)G) {
3063       int tail_off1 = int_cnt2<<scale1;
3064       int tail_off2 = int_cnt2<<scale2;
3065       if (ae == StrIntrinsicNode::UL) {
3066         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
3067       } else {
3068         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
3069       }
3070       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
3071     } else {
3072       // calculate index in register to avoid integer overflow (int_cnt2*2)
3073       movl(tmp, int_cnt2);
3074       addptr(tmp, cnt2);
3075       if (ae == StrIntrinsicNode::UL) {
3076         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
3077       } else {
3078         movdqu(vec, Address(str2, tmp, scale2, 0));
3079       }
3080       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
3081     }
3082     // Need to reload the string pointers if we did not match the whole vector.
3083     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3084     addptr(cnt2, stride);
3085     jcc(Assembler::negative, SCAN_SUBSTR);
3086     // Fall through if found full substring
3087 
3088   } // (int_cnt2 > 8)
3089 
3090   bind(RET_FOUND);
3091   // Found result if we matched full small substring.
3092   // Compute substr offset
3093   subptr(result, str1);
3094   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3095     shrl(result, 1); // index
3096   }
3097   bind(EXIT);
3098 
3099 } // string_indexofC8
3100 
3101 // Small strings are loaded through the stack if they cross a page boundary.
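     // The page-crossing checks below rely on the fact that a 16-byte load at
     // address p stays within one page iff (p & (page_size - 1)) <= page_size - 16,
     // since the heap is mapped in whole pages.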
3102 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
3103                                        Register cnt1, Register cnt2,
3104                                        int int_cnt2,  Register result,
3105                                        XMMRegister vec, Register tmp,
3106                                        int ae) {
3107   ShortBranchVerifier sbv(this);
3108   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3109   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3110 
3111   //
3112   // int_cnt2 is the length of a small (< 8 chars) constant substring,
3113   // or (-1) for a non-constant substring, in which case its length
3114   // is in the cnt2 register.
3115   //
3116   // Note, inline_string_indexOf() generates checks:
3117   // if (substr.count > string.count) return -1;
3118   // if (substr.count == 0) return 0;
3119   //
3120   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
3121   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
3122   // This method uses the pcmpestri instruction with bound registers
3123   //   inputs:
3124   //     xmm - substring
3125   //     rax - substring length (elements count)
3126   //     mem - scanned string
3127   //     rdx - string length (elements count)
3128   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
3129   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
3130   //   outputs:
3131   //     rcx - matched index in string
3132   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3133   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
3134   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
3135   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
3136 
3137   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
3138         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
3139         FOUND_CANDIDATE;
3140 
3141   { //========================================================
3142     // We don't know where these strings are located
3143     // and we can't read beyond them. Load them through the stack.
3144     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
3145 
3146     movptr(tmp, rsp); // save old SP
3147 
3148     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
3149       if (int_cnt2 == (1>>scale2)) { // One byte
3150         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
3151         load_unsigned_byte(result, Address(str2, 0));
3152         movdl(vec, result); // move 32 bits
3153       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
3154         // Not enough header space in 32-bit VM: 12+3 = 15.
3155         movl(result, Address(str2, -1));
3156         shrl(result, 8);
3157         movdl(vec, result); // move 32 bits
3158       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
3159         load_unsigned_short(result, Address(str2, 0));
3160         movdl(vec, result); // move 32 bits
3161       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
3162         movdl(vec, Address(str2, 0)); // move 32 bits
3163       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
3164         movq(vec, Address(str2, 0));  // move 64 bits
3165       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
3166         // Array header size is 12 bytes in 32-bit VM
3167         // + 6 bytes for 3 chars == 18 bytes,
3168         // enough space to load vec and shift.
3169         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3170         if (ae == StrIntrinsicNode::UL) {
3171           int tail_off = int_cnt2-8;
3172           pmovzxbw(vec, Address(str2, tail_off));
3173           psrldq(vec, -2*tail_off);
3174         }
3175         else {
3176           int tail_off = int_cnt2*(1<<scale2);
3177           movdqu(vec, Address(str2, tail_off-16));
3178           psrldq(vec, 16-tail_off);
3179         }
3180       }
3181     } else { // not constant substring
3182       cmpl(cnt2, stride);
3183       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3184 
3185       // We can read beyond the string if str+16 does not cross a page boundary
3186       // since heaps are aligned and mapped by pages.
3187       assert(os::vm_page_size() < (int)G, "default page should be small");
3188       movl(result, str2); // We need only low 32 bits
3189       andl(result, ((int)os::vm_page_size()-1));
3190       cmpl(result, ((int)os::vm_page_size()-16));
3191       jccb(Assembler::belowEqual, CHECK_STR);
3192 
3193       // Move small strings to the stack to allow loading 16 bytes into vec.
3194       subptr(rsp, 16);
3195       int stk_offset = wordSize-(1<<scale2);
3196       push(cnt2);
3197 
3198       bind(COPY_SUBSTR);
3199       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3200         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3201         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3202       } else if (ae == StrIntrinsicNode::UU) {
3203         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3204         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3205       }
3206       decrement(cnt2);
3207       jccb(Assembler::notZero, COPY_SUBSTR);
3208 
3209       pop(cnt2);
3210       movptr(str2, rsp);  // New substring address
3211     } // non constant
3212 
3213     bind(CHECK_STR);
3214     cmpl(cnt1, stride);
3215     jccb(Assembler::aboveEqual, BIG_STRINGS);
3216 
3217     // Check cross page boundary.
3218     movl(result, str1); // We need only low 32 bits
3219     andl(result, ((int)os::vm_page_size()-1));
3220     cmpl(result, ((int)os::vm_page_size()-16));
3221     jccb(Assembler::belowEqual, BIG_STRINGS);
3222 
3223     subptr(rsp, 16);
3224     int stk_offset = -(1<<scale1);
3225     if (int_cnt2 < 0) { // not constant
3226       push(cnt2);
3227       stk_offset += wordSize;
3228     }
3229     movl(cnt2, cnt1);
3230 
3231     bind(COPY_STR);
3232     if (ae == StrIntrinsicNode::LL) {
3233       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3234       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3235     } else {
3236       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3237       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3238     }
3239     decrement(cnt2);
3240     jccb(Assembler::notZero, COPY_STR);
3241 
3242     if (int_cnt2 < 0) { // not constant
3243       pop(cnt2);
3244     }
3245     movptr(str1, rsp);  // New string address
3246 
3247     bind(BIG_STRINGS);
3248     // Load substring.
3249     if (int_cnt2 < 0) { // -1
3250       if (ae == StrIntrinsicNode::UL) {
3251         pmovzxbw(vec, Address(str2, 0));
3252       } else {
3253         movdqu(vec, Address(str2, 0));
3254       }
3255       push(cnt2);       // substr count
3256       push(str2);       // substr addr
3257       push(str1);       // string addr
3258     } else {
3259       // Small (< 8 chars) constant substrings are loaded already.
3260       movl(cnt2, int_cnt2);
3261     }
3262     push(tmp);  // original SP
3263 
3264   } // Finished loading
3265 
3266   //========================================================
3267   // Start search
3268   //
3269 
3270   movptr(result, str1); // string addr
3271 
3272   if (int_cnt2  < 0) {  // Only for non constant substring
3273     jmpb(SCAN_TO_SUBSTR);
3274 
3275     // SP saved at sp+0
3276     // String saved at sp+1*wordSize
3277     // Substr saved at sp+2*wordSize
3278     // Substr count saved at sp+3*wordSize
3279 
3280     // Reload substr for rescan; this code
3281     // is executed only for large substrings (> 8 chars).
3282     bind(RELOAD_SUBSTR);
3283     movptr(str2, Address(rsp, 2*wordSize));
3284     movl(cnt2, Address(rsp, 3*wordSize));
3285     if (ae == StrIntrinsicNode::UL) {
3286       pmovzxbw(vec, Address(str2, 0));
3287     } else {
3288       movdqu(vec, Address(str2, 0));
3289     }
3290     // We came here after the beginning of the substring was
3291     // matched but the rest of it was not, so we need to search
3292     // again. Start from the next element after the previous match.
3293     subptr(str1, result); // Restore counter
3294     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3295       shrl(str1, 1);
3296     }
3297     addl(cnt1, str1);
3298     decrementl(cnt1);   // Shift to next element
3299     cmpl(cnt1, cnt2);
3300     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3301 
3302     addptr(result, (1<<scale1));
3303   } // non constant
3304 
3305   // Scan string for start of substr in 16-byte vectors
3306   bind(SCAN_TO_SUBSTR);
3307   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3308   pcmpestri(vec, Address(result, 0), mode);
3309   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3310   subl(cnt1, stride);
3311   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3312   cmpl(cnt1, cnt2);
3313   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3314   addptr(result, 16);
3315 
3316   bind(ADJUST_STR);
3317   cmpl(cnt1, stride); // Do not read beyond string
3318   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3319   // Back-up string to avoid reading beyond string.
3320   lea(result, Address(result, cnt1, scale1, -16));
3321   movl(cnt1, stride);
3322   jmpb(SCAN_TO_SUBSTR);
3323 
3324   // Found a potential substr
3325   bind(FOUND_CANDIDATE);
3326   // After pcmpestri tmp(rcx) contains matched element index
3327 
3328   // Make sure string is still long enough
3329   subl(cnt1, tmp);
3330   cmpl(cnt1, cnt2);
3331   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3332   // Left less than substring.
3333 
3334   bind(RET_NOT_FOUND);
3335   movl(result, -1);
3336   jmp(CLEANUP);
3337 
3338   bind(FOUND_SUBSTR);
3339   // Compute start addr of substr
3340   lea(result, Address(result, tmp, scale1));
3341   if (int_cnt2 > 0) { // Constant substring
3342     // Repeat search for small substring (< 8 chars)
3343     // from new point without reloading substring.
3344     // Have to check that we don't read beyond string.
3345     cmpl(tmp, stride-int_cnt2);
3346     jccb(Assembler::greater, ADJUST_STR);
3347     // Fall through if matched whole substring.
3348   } else { // non constant
3349     assert(int_cnt2 == -1, "should be != 0");
3350 
3351     addl(tmp, cnt2);
3352     // Found result if we matched whole substring.
3353     cmpl(tmp, stride);
3354     jcc(Assembler::lessEqual, RET_FOUND);
3355 
3356     // Repeat search for small substring (<= 8 chars)
3357     // from new point 'str1' without reloading substring.
3358     cmpl(cnt2, stride);
3359     // Have to check that we don't read beyond string.
3360     jccb(Assembler::lessEqual, ADJUST_STR);
3361 
3362     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3363     // Compare the rest of substring (> 8 chars).
3364     movptr(str1, result);
3365 
3366     cmpl(tmp, cnt2);
3367     // First 8 chars are already matched.
3368     jccb(Assembler::equal, CHECK_NEXT);
3369 
3370     bind(SCAN_SUBSTR);
3371     pcmpestri(vec, Address(str1, 0), mode);
3372     // Need to reload strings pointers if not matched whole vector
3373     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3374 
3375     bind(CHECK_NEXT);
3376     subl(cnt2, stride);
3377     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3378     addptr(str1, 16);
3379     if (ae == StrIntrinsicNode::UL) {
3380       addptr(str2, 8);
3381     } else {
3382       addptr(str2, 16);
3383     }
3384     subl(cnt1, stride);
3385     cmpl(cnt2, stride); // Do not read beyond substring
3386     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3387     // Back-up strings to avoid reading beyond substring.
3388 
3389     if (ae == StrIntrinsicNode::UL) {
3390       lea(str2, Address(str2, cnt2, scale2, -8));
3391       lea(str1, Address(str1, cnt2, scale1, -16));
3392     } else {
3393       lea(str2, Address(str2, cnt2, scale2, -16));
3394       lea(str1, Address(str1, cnt2, scale1, -16));
3395     }
3396     subl(cnt1, cnt2);
3397     movl(cnt2, stride);
3398     addl(cnt1, stride);
3399     bind(CONT_SCAN_SUBSTR);
3400     if (ae == StrIntrinsicNode::UL) {
3401       pmovzxbw(vec, Address(str2, 0));
3402     } else {
3403       movdqu(vec, Address(str2, 0));
3404     }
3405     jmp(SCAN_SUBSTR);
3406 
3407     bind(RET_FOUND_LONG);
3408     movptr(str1, Address(rsp, wordSize));
3409   } // non constant
3410 
3411   bind(RET_FOUND);
3412   // Compute substr offset
3413   subptr(result, str1);
3414   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3415     shrl(result, 1); // index
3416   }
3417   bind(CLEANUP);
3418   pop(rsp); // restore SP
3419 
3420 } // string_indexof
3421 
3422 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3423                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3424   ShortBranchVerifier sbv(this);
3425   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3426 
3427   int stride = 8;
3428 
3429   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3430         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3431         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3432         FOUND_SEQ_CHAR, DONE_LABEL;
3433 
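       // Broadcast the 16-bit char into vec1 and keep vec2 == 0.  pcmpeqw /
       // vpcmpeqw set a lane to all-ones on a match, and ptest(vec2, vec3) with
       // vec2 == 0 sets CF iff vec3 is all zero, so 'carryClear' means at least
       // one lane compared equal and we branch to FOUND_CHAR.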
3434   movptr(result, str1);
3435   if (UseAVX >= 2) {
3436     cmpl(cnt1, stride);
3437     jcc(Assembler::less, SCAN_TO_CHAR);
3438     cmpl(cnt1, 2*stride);
3439     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3440     movdl(vec1, ch);
3441     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3442     vpxor(vec2, vec2);
3443     movl(tmp, cnt1);
3444     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3445     andl(cnt1,0x0000000F);  //tail count (in chars)
3446 
3447     bind(SCAN_TO_16_CHAR_LOOP);
3448     vmovdqu(vec3, Address(result, 0));
3449     vpcmpeqw(vec3, vec3, vec1, 1);
3450     vptest(vec2, vec3);
3451     jcc(Assembler::carryClear, FOUND_CHAR);
3452     addptr(result, 32);
3453     subl(tmp, 2*stride);
3454     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3455     jmp(SCAN_TO_8_CHAR);
3456     bind(SCAN_TO_8_CHAR_INIT);
3457     movdl(vec1, ch);
3458     pshuflw(vec1, vec1, 0x00);
3459     pshufd(vec1, vec1, 0);
3460     pxor(vec2, vec2);
3461   }
3462   bind(SCAN_TO_8_CHAR);
3463   cmpl(cnt1, stride);
3464   jcc(Assembler::less, SCAN_TO_CHAR);
3465   if (UseAVX < 2) {
3466     movdl(vec1, ch);
3467     pshuflw(vec1, vec1, 0x00);
3468     pshufd(vec1, vec1, 0);
3469     pxor(vec2, vec2);
3470   }
3471   movl(tmp, cnt1);
3472   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3473   andl(cnt1,0x00000007);  //tail count (in chars)
3474 
3475   bind(SCAN_TO_8_CHAR_LOOP);
3476   movdqu(vec3, Address(result, 0));
3477   pcmpeqw(vec3, vec1);
3478   ptest(vec2, vec3);
3479   jcc(Assembler::carryClear, FOUND_CHAR);
3480   addptr(result, 16);
3481   subl(tmp, stride);
3482   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3483   bind(SCAN_TO_CHAR);
3484   testl(cnt1, cnt1);
3485   jcc(Assembler::zero, RET_NOT_FOUND);
3486   bind(SCAN_TO_CHAR_LOOP);
3487   load_unsigned_short(tmp, Address(result, 0));
3488   cmpl(ch, tmp);
3489   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3490   addptr(result, 2);
3491   subl(cnt1, 1);
3492   jccb(Assembler::zero, RET_NOT_FOUND);
3493   jmp(SCAN_TO_CHAR_LOOP);
3494 
3495   bind(RET_NOT_FOUND);
3496   movl(result, -1);
3497   jmpb(DONE_LABEL);
3498 
3499   bind(FOUND_CHAR);
3500   if (UseAVX >= 2) {
3501     vpmovmskb(tmp, vec3);
3502   } else {
3503     pmovmskb(tmp, vec3);
3504   }
3505   bsfl(ch, tmp);
3506   addptr(result, ch);
3507 
3508   bind(FOUND_SEQ_CHAR);
3509   subptr(result, str1);
3510   shrl(result, 1);
3511 
3512   bind(DONE_LABEL);
3513 } // string_indexof_char
3514 
3515 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3516                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3517   ShortBranchVerifier sbv(this);
3518   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3519 
3520   int stride = 16;
3521 
3522   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3523         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3524         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3525         FOUND_SEQ_CHAR, DONE_LABEL;
3526 
3527   movptr(result, str1);
3528   if (UseAVX >= 2) {
3529     cmpl(cnt1, stride);
3530     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3531     cmpl(cnt1, stride*2);
3532     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3533     movdl(vec1, ch);
3534     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3535     vpxor(vec2, vec2);
3536     movl(tmp, cnt1);
3537     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3538     andl(cnt1,0x0000001F);  //tail count (in chars)
3539 
3540     bind(SCAN_TO_32_CHAR_LOOP);
3541     vmovdqu(vec3, Address(result, 0));
3542     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3543     vptest(vec2, vec3);
3544     jcc(Assembler::carryClear, FOUND_CHAR);
3545     addptr(result, 32);
3546     subl(tmp, stride*2);
3547     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3548     jmp(SCAN_TO_16_CHAR);
3549 
3550     bind(SCAN_TO_16_CHAR_INIT);
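         // Broadcast the byte value: pshufb with an all-zero shuffle control (vec2)
         // replicates byte 0 of vec1 into every byte lane.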
3551     movdl(vec1, ch);
3552     pxor(vec2, vec2);
3553     pshufb(vec1, vec2);
3554   }
3555 
3556   bind(SCAN_TO_16_CHAR);
3557   cmpl(cnt1, stride);
3558   jcc(Assembler::less, SCAN_TO_CHAR_INIT);  // less than 16 entries left
3559   if (UseAVX < 2) {
3560     movdl(vec1, ch);
3561     pxor(vec2, vec2);
3562     pshufb(vec1, vec2);
3563   }
3564   movl(tmp, cnt1);
3565   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3566   andl(cnt1,0x0000000F);  //tail count (in bytes)
3567 
3568   bind(SCAN_TO_16_CHAR_LOOP);
3569   movdqu(vec3, Address(result, 0));
3570   pcmpeqb(vec3, vec1);
3571   ptest(vec2, vec3);
3572   jcc(Assembler::carryClear, FOUND_CHAR);
3573   addptr(result, 16);
3574   subl(tmp, stride);
3575   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);  // last 16 items...
3576 
3577   bind(SCAN_TO_CHAR_INIT);
3578   testl(cnt1, cnt1);
3579   jcc(Assembler::zero, RET_NOT_FOUND);
3580   bind(SCAN_TO_CHAR_LOOP);
3581   load_unsigned_byte(tmp, Address(result, 0));
3582   cmpl(ch, tmp);
3583   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3584   addptr(result, 1);
3585   subl(cnt1, 1);
3586   jccb(Assembler::zero, RET_NOT_FOUND);
3587   jmp(SCAN_TO_CHAR_LOOP);
3588 
3589   bind(RET_NOT_FOUND);
3590   movl(result, -1);
3591   jmpb(DONE_LABEL);
3592 
3593   bind(FOUND_CHAR);
3594   if (UseAVX >= 2) {
3595     vpmovmskb(tmp, vec3);
3596   } else {
3597     pmovmskb(tmp, vec3);
3598   }
3599   bsfl(ch, tmp);
3600   addptr(result, ch);
3601 
3602   bind(FOUND_SEQ_CHAR);
3603   subptr(result, str1);
3604 
3605   bind(DONE_LABEL);
3606 } // stringL_indexof_char
3607 
3608 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3609   switch (eltype) {
3610   case T_BOOLEAN: return sizeof(jboolean);
3611   case T_BYTE:  return sizeof(jbyte);
3612   case T_SHORT: return sizeof(jshort);
3613   case T_CHAR:  return sizeof(jchar);
3614   case T_INT:   return sizeof(jint);
3615   default:
3616     ShouldNotReachHere();
3617     return -1;
3618   }
3619 }
3620 
3621 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3622   switch (eltype) {
3623   // T_BOOLEAN used as surrogate for unsigned byte
3624   case T_BOOLEAN: movzbl(dst, src);   break;
3625   case T_BYTE:    movsbl(dst, src);   break;
3626   case T_SHORT:   movswl(dst, src);   break;
3627   case T_CHAR:    movzwl(dst, src);   break;
3628   case T_INT:     movl(dst, src);     break;
3629   default:
3630     ShouldNotReachHere();
3631   }
3632 }
3633 
3634 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3635   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3636 }
3637 
3638 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3639   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3640 }
3641 
3642 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3643   const int vlen = Assembler::AVX_256bit;
3644   switch (eltype) {
3645   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3646   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3647   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3648   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3649   case T_INT:
3650     // do nothing
3651     break;
3652   default:
3653     ShouldNotReachHere();
3654   }
3655 }
3656 
3657 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3658                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3659                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3660                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3661                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3662                                         BasicType eltype) {
3663   ShortBranchVerifier sbv(this);
3664   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3665   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3666   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3667 
3668   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3669         SHORT_UNROLLED_LOOP_EXIT,
3670         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3671         UNROLLED_VECTOR_LOOP_BEGIN,
3672         END;
3673   switch (eltype) {
3674   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3675   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3676   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3677   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3678   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3679   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3680   }
3681 
3682   // Register array aliases, for readability of the code
3683   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3684                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3685                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3686 
3687   const int elsize = arrays_hashcode_elsize(eltype);
3688 
3689   /*
3690     if (cnt1 >= 2) {
3691       if (cnt1 >= 32) {
3692         UNROLLED VECTOR LOOP
3693       }
3694       UNROLLED SCALAR LOOP
3695     }
3696     SINGLE SCALAR
3697    */
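       // Assuming power_of_31_backwards holds [31^32, 31^31, ..., 31^1, 31^0],
       // each iteration of the vector loop below computes, per 32-element block,
       //   result = result * 31^32 + sum_{i=0}^{31} ary1[index + i] * 31^(31 - i)
       // which is the scalar recurrence h = 31 * h + a[i] unrolled 32 times.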
3698 
3699   cmpl(cnt1, 32);
3700   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3701 
3702   // cnt1 >= 32 && generate_vectorized_loop
3703   xorl(index, index);
3704 
3705   // vresult = IntVector.zero(I256);
3706   for (int idx = 0; idx < 4; idx++) {
3707     vpxor(vresult[idx], vresult[idx]);
3708   }
3709   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3710   Register bound = tmp2;
3711   Register next = tmp3;
3712   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3713   movl(next, Address(tmp2, 0));
3714   movdl(vnext, next);
3715   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3716 
3717   // index = 0;
3718   // bound = cnt1 & ~(32 - 1);
3719   movl(bound, cnt1);
3720   andl(bound, ~(32 - 1));
3721   // for (; index < bound; index += 32) {
3722   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3723   // result *= next;
3724   imull(result, next);
3725   // Loop fission to front-load the cost of fetching from memory; out-of-order
3726   // execution can then hopefully do a better job of prefetching.
3727   for (int idx = 0; idx < 4; idx++) {
3728     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3729   }
3730   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3731   for (int idx = 0; idx < 4; idx++) {
3732     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3733     arrays_hashcode_elvcast(vtmp[idx], eltype);
3734     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3735   }
3736   // index += 32;
3737   addl(index, 32);
3738   // index < bound;
3739   cmpl(index, bound);
3740   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3741   // }
3742 
3743   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3744   subl(cnt1, bound);
3745   // release bound
3746 
3747   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3748   for (int idx = 0; idx < 4; idx++) {
3749     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3750     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3751     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3752   }
3753   // result += vresult.reduceLanes(ADD);
3754   for (int idx = 0; idx < 4; idx++) {
3755     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3756   }
3757 
3758   // } else if (cnt1 < 32) {
3759 
3760   bind(SHORT_UNROLLED_BEGIN);
3761   // int i = 1;
3762   movl(index, 1);
3763   cmpl(index, cnt1);
3764   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3765 
3766   // for (; i < cnt1 ; i += 2) {
3767   bind(SHORT_UNROLLED_LOOP_BEGIN);
3768   movl(tmp3, 961);
3769   imull(result, tmp3);
3770   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3771   movl(tmp3, tmp2);
3772   shll(tmp3, 5);
3773   subl(tmp3, tmp2);
3774   addl(result, tmp3);
3775   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3776   addl(result, tmp3);
3777   addl(index, 2);
3778   cmpl(index, cnt1);
3779   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3780 
3781   // }
3782   // if (i >= cnt1) {
3783   bind(SHORT_UNROLLED_LOOP_EXIT);
3784   jccb(Assembler::greater, END);
3785   movl(tmp2, result);
3786   shll(result, 5);
3787   subl(result, tmp2);
3788   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3789   addl(result, tmp3);
3790   // }
3791   bind(END);
3792 
3793   BLOCK_COMMENT("} // arrays_hashcode");
3794 
3795 } // arrays_hashcode
3796 
3797 // helper function for string_compare
3798 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3799                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3800                                            Address::ScaleFactor scale2, Register index, int ae) {
3801   if (ae == StrIntrinsicNode::LL) {
3802     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3803     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3804   } else if (ae == StrIntrinsicNode::UU) {
3805     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3806     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3807   } else {
3808     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3809     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3810   }
3811 }
3812 
3813 // Compare strings, used for char[] and byte[].
3814 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3815                                        Register cnt1, Register cnt2, Register result,
3816                                        XMMRegister vec1, int ae, KRegister mask) {
3817   ShortBranchVerifier sbv(this);
3818   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3819   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only with AVX3
3820   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3821   int stride2x2 = 0x40;
3822   Address::ScaleFactor scale = Address::no_scale;
3823   Address::ScaleFactor scale1 = Address::no_scale;
3824   Address::ScaleFactor scale2 = Address::no_scale;
3825 
3826   if (ae != StrIntrinsicNode::LL) {
3827     stride2x2 = 0x20;
3828   }
3829 
3830   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3831     shrl(cnt2, 1);
3832   }
3833   // Compute the minimum of the string lengths and save the
3834   // difference of the string lengths on the stack.
3835   // The minimum is computed with a conditional move.
3836   movl(result, cnt1);
3837   subl(cnt1, cnt2);
3838   push(cnt1);
3839   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
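       // From here on cnt2 == min(cnt1, cnt2) and the value pushed above is the
       // original cnt1 - cnt2, which becomes the result at LENGTH_DIFF_LABEL when
       // the strings are equal up to the minimum length.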
3840 
3841   // Is the minimum length zero?
3842   testl(cnt2, cnt2);
3843   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3844   if (ae == StrIntrinsicNode::LL) {
3845     // Load first bytes
3846     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3847     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3848   } else if (ae == StrIntrinsicNode::UU) {
3849     // Load first characters
3850     load_unsigned_short(result, Address(str1, 0));
3851     load_unsigned_short(cnt1, Address(str2, 0));
3852   } else {
3853     load_unsigned_byte(result, Address(str1, 0));
3854     load_unsigned_short(cnt1, Address(str2, 0));
3855   }
3856   subl(result, cnt1);
3857   jcc(Assembler::notZero,  POP_LABEL);
3858 
3859   if (ae == StrIntrinsicNode::UU) {
3860     // Divide length by 2 to get number of chars
3861     shrl(cnt2, 1);
3862   }
3863   cmpl(cnt2, 1);
3864   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3865 
3866   // Check if the strings start at the same location and setup scale and stride
3867   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3868     cmpptr(str1, str2);
3869     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3870     if (ae == StrIntrinsicNode::LL) {
3871       scale = Address::times_1;
3872       stride = 16;
3873     } else {
3874       scale = Address::times_2;
3875       stride = 8;
3876     }
3877   } else {
3878     scale1 = Address::times_1;
3879     scale2 = Address::times_2;
3880     // scale not used
3881     stride = 8;
3882   }
3883 
3884   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3885     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3886     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3887     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3888     Label COMPARE_TAIL_LONG;
3889     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only with AVX3
3890 
3891     int pcmpmask = 0x19;
3892     if (ae == StrIntrinsicNode::LL) {
3893       pcmpmask &= ~0x01;
3894     }
3895 
3896     // Set up to compare 16-char (32-byte) vectors;
3897     // start from the first character again because its address is aligned.
3898     if (ae == StrIntrinsicNode::LL) {
3899       stride2 = 32;
3900     } else {
3901       stride2 = 16;
3902     }
3903     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3904       adr_stride = stride << scale;
3905     } else {
3906       adr_stride1 = 8;  //stride << scale1;
3907       adr_stride2 = 16; //stride << scale2;
3908     }
3909 
3910     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3911     // rax and rdx are used by pcmpestri as element counters
3912     movl(result, cnt2);
3913     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3914     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3915 
3916     // Fast path: compare the first two 8-char vectors.
3917     bind(COMPARE_16_CHARS);
3918     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3919       movdqu(vec1, Address(str1, 0));
3920     } else {
3921       pmovzxbw(vec1, Address(str1, 0));
3922     }
3923     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3924     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3925 
3926     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3927       movdqu(vec1, Address(str1, adr_stride));
3928       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3929     } else {
3930       pmovzxbw(vec1, Address(str1, adr_stride1));
3931       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3932     }
3933     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3934     addl(cnt1, stride);
3935 
3936     // Compare the characters at index in cnt1
3937     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3938     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3939     subl(result, cnt2);
3940     jmp(POP_LABEL);
3941 
3942     // Setup the registers to start vector comparison loop
3943     bind(COMPARE_WIDE_VECTORS);
3944     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3945       lea(str1, Address(str1, result, scale));
3946       lea(str2, Address(str2, result, scale));
3947     } else {
3948       lea(str1, Address(str1, result, scale1));
3949       lea(str2, Address(str2, result, scale2));
3950     }
3951     subl(result, stride2);
3952     subl(cnt2, stride2);
3953     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3954     negptr(result);
3955 
3956     // In a loop, compare 16 chars (32 bytes) at once using vpxor + vptest.
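         // vpxor of two equal 32-byte chunks yields all zeroes, and vptest(vec1, vec1)
         // sets ZF iff vec1 is zero, so 'notZero' on that test signals a mismatch.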
3957     bind(COMPARE_WIDE_VECTORS_LOOP);
3958 
3959     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // try the 64-byte fast loop
3960       cmpl(cnt2, stride2x2);
3961       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3962       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3963       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract 0x40
3964 
3965       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3966       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3967         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3968         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // mask == 11..11 if operands are equal, otherwise mask has some zero bits
3969       } else {
3970         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3971         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // mask == 11..11 if operands are equal, otherwise mask has some zero bits
3972       }
3973       kortestql(mask, mask);
3974       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3975       addptr(result, stride2x2);  // update since we already compared at this addr
3976       subl(cnt2, stride2x2);      // and sub the size too
3977       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3978 
3979       vpxor(vec1, vec1);
3980       jmpb(COMPARE_WIDE_TAIL);
3981     }//if (VM_Version::supports_avx512vlbw())
3982 
3983     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3984     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3985       vmovdqu(vec1, Address(str1, result, scale));
3986       vpxor(vec1, Address(str2, result, scale));
3987     } else {
3988       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3989       vpxor(vec1, Address(str2, result, scale2));
3990     }
3991     vptest(vec1, vec1);
3992     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3993     addptr(result, stride2);
3994     subl(cnt2, stride2);
3995     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3996     // clean upper bits of YMM registers
3997     vpxor(vec1, vec1);
3998 
3999     // compare wide vectors tail
4000     bind(COMPARE_WIDE_TAIL);
4001     testptr(result, result);
4002     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
4003 
4004     movl(result, stride2);
4005     movl(cnt2, result);
4006     negptr(result);
4007     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4008 
4009     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
4010     bind(VECTOR_NOT_EQUAL);
4011     // clean upper bits of YMM registers
4012     vpxor(vec1, vec1);
4013     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4014       lea(str1, Address(str1, result, scale));
4015       lea(str2, Address(str2, result, scale));
4016     } else {
4017       lea(str1, Address(str1, result, scale1));
4018       lea(str2, Address(str2, result, scale2));
4019     }
4020     jmp(COMPARE_16_CHARS);
4021 
4022     // Compare tail chars, length between 1 and 15 chars.
4023     bind(COMPARE_TAIL_LONG);
4024     movl(cnt2, result);
4025     cmpl(cnt2, stride);
4026     jcc(Assembler::less, COMPARE_SMALL_STR);
4027 
4028     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4029       movdqu(vec1, Address(str1, 0));
4030     } else {
4031       pmovzxbw(vec1, Address(str1, 0));
4032     }
4033     pcmpestri(vec1, Address(str2, 0), pcmpmask);
4034     jcc(Assembler::below, COMPARE_INDEX_CHAR);
4035     subptr(cnt2, stride);
4036     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
4037     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4038       lea(str1, Address(str1, result, scale));
4039       lea(str2, Address(str2, result, scale));
4040     } else {
4041       lea(str1, Address(str1, result, scale1));
4042       lea(str2, Address(str2, result, scale2));
4043     }
4044     negptr(cnt2);
4045     jmpb(WHILE_HEAD_LABEL);
4046 
4047     bind(COMPARE_SMALL_STR);
4048   } else if (UseSSE42Intrinsics) {
4049     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
4050     int pcmpmask = 0x19;
4051     // Set up to compare 8-char (16-byte) vectors;
4052     // start from the first character again because its address is aligned.
4053     movl(result, cnt2);
4054     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
4055     if (ae == StrIntrinsicNode::LL) {
4056       pcmpmask &= ~0x01;
4057     }
4058     jcc(Assembler::zero, COMPARE_TAIL);
4059     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4060       lea(str1, Address(str1, result, scale));
4061       lea(str2, Address(str2, result, scale));
4062     } else {
4063       lea(str1, Address(str1, result, scale1));
4064       lea(str2, Address(str2, result, scale2));
4065     }
4066     negptr(result);
4067 
4068     // pcmpestri
4069     //   inputs:
4070     //     vec1- substring
4071     //     rax - negative string length (elements count)
4072     //     mem - scanned string
4073     //     rdx - string length (elements count)
4074     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
4075     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
4076     //   outputs:
4077     //     rcx - first mismatched element index
4078     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
4079 
4080     bind(COMPARE_WIDE_VECTORS);
4081     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4082       movdqu(vec1, Address(str1, result, scale));
4083       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
4084     } else {
4085       pmovzxbw(vec1, Address(str1, result, scale1));
4086       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
4087     }
4088     // After pcmpestri cnt1(rcx) contains mismatched element index
4089 
4090     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
4091     addptr(result, stride);
4092     subptr(cnt2, stride);
4093     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4094 
4095     // compare wide vectors tail
4096     testptr(result, result);
4097     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
4098 
4099     movl(cnt2, stride);
4100     movl(result, stride);
4101     negptr(result);
4102     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4103       movdqu(vec1, Address(str1, result, scale));
4104       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
4105     } else {
4106       pmovzxbw(vec1, Address(str1, result, scale1));
4107       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
4108     }
4109     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
4110 
4111     // Mismatched characters in the vectors
4112     bind(VECTOR_NOT_EQUAL);
4113     addptr(cnt1, result);
4114     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
4115     subl(result, cnt2);
4116     jmpb(POP_LABEL);
4117 
4118     bind(COMPARE_TAIL); // limit is zero
4119     movl(cnt2, result);
4120     // Fallthru to tail compare
4121   }
4122   // Shift str2 and str1 to the end of the arrays, negate min
4123   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4124     lea(str1, Address(str1, cnt2, scale));
4125     lea(str2, Address(str2, cnt2, scale));
4126   } else {
4127     lea(str1, Address(str1, cnt2, scale1));
4128     lea(str2, Address(str2, cnt2, scale2));
4129   }
4130   decrementl(cnt2);  // first character was compared already
4131   negptr(cnt2);
4132 
4133   // Compare the rest of the elements
4134   bind(WHILE_HEAD_LABEL);
4135   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
4136   subl(result, cnt1);
4137   jccb(Assembler::notZero, POP_LABEL);
4138   increment(cnt2);
4139   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
4140 
4141   // Strings are equal up to min length.  Return the length difference.
4142   bind(LENGTH_DIFF_LABEL);
4143   pop(result);
4144   if (ae == StrIntrinsicNode::UU) {
4145     // Divide diff by 2 to get number of chars
4146     sarl(result, 1);
4147   }
4148   jmpb(DONE_LABEL);
4149 
4150   if (VM_Version::supports_avx512vlbw()) {
4151 
4152     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
4153 
4154     kmovql(cnt1, mask);
4155     notq(cnt1);
4156     bsfq(cnt2, cnt1);
4157     if (ae != StrIntrinsicNode::LL) {
4158       // Divide diff by 2 to get number of chars
4159       sarl(cnt2, 1);
4160     }
4161     addq(result, cnt2);
4162     if (ae == StrIntrinsicNode::LL) {
4163       load_unsigned_byte(cnt1, Address(str2, result));
4164       load_unsigned_byte(result, Address(str1, result));
4165     } else if (ae == StrIntrinsicNode::UU) {
4166       load_unsigned_short(cnt1, Address(str2, result, scale));
4167       load_unsigned_short(result, Address(str1, result, scale));
4168     } else {
4169       load_unsigned_short(cnt1, Address(str2, result, scale2));
4170       load_unsigned_byte(result, Address(str1, result, scale1));
4171     }
4172     subl(result, cnt1);
4173     jmpb(POP_LABEL);
4174   }//if (VM_Version::supports_avx512vlbw())
4175 
4176   // Discard the stored length difference
4177   bind(POP_LABEL);
4178   pop(cnt1);
4179 
4180   // That's it
4181   bind(DONE_LABEL);
4182   if (ae == StrIntrinsicNode::UL) {
4183     negl(result);
4184   }
4185 
4186 }
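
     // The code above reduces to the following Java-level semantics (an
     // illustrative sketch only, with 'a' and 'b' standing for the two element
     // sequences after decoding according to 'ae'; the UL case additionally
     // negates the final result, as done at DONE_LABEL above):
     //   int min = Math.min(a.length, b.length);
     //   for (int i = 0; i < min; i++) {
     //     if (a[i] != b[i]) {
     //       return a[i] - b[i];       // difference of first mismatched elements
     //     }
     //   }
     //   return a.length - b.length;   // equal up to min length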
4187 
4188 // Search for a non-ASCII character (negative byte value) in a byte array;
4189 // return the index of the first such character, otherwise the length
4190 // of the array segment searched.
4191 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4192 //   @IntrinsicCandidate
4193 //   public static int countPositives(byte[] ba, int off, int len) {
4194 //     for (int i = off; i < off + len; i++) {
4195 //       if (ba[i] < 0) {
4196 //         return i - off;
4197 //       }
4198 //     }
4199 //     return len;
4200 //   }
4201 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4202   Register result, Register tmp1,
4203   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4204   // rsi: byte array
4205   // rcx: len
4206   // rax: result
4207   ShortBranchVerifier sbv(this);
4208   assert_different_registers(ary1, len, result, tmp1);
4209   assert_different_registers(vec1, vec2);
4210   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4211 
4212   movl(result, len); // copy
4213   // len == 0
4214   testl(len, len);
4215   jcc(Assembler::zero, DONE);
4216 
4217   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4218     VM_Version::supports_avx512vlbw() &&
4219     VM_Version::supports_bmi2()) {
4220 
4221     Label test_64_loop, test_tail, BREAK_LOOP;
4222     movl(tmp1, len);
4223     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4224 
4225     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4226     andl(len,  0xffffffc0); // vector count (in chars)
4227     jccb(Assembler::zero, test_tail);
4228 
4229     lea(ary1, Address(ary1, len, Address::times_1));
4230     negptr(len);
4231 
4232     bind(test_64_loop);
4233     // Check whether any of the 64 byte elements is negative
4234     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4235     kortestql(mask1, mask1);
4236     jcc(Assembler::notZero, BREAK_LOOP);
4237 
4238     addptr(len, 64);
4239     jccb(Assembler::notZero, test_64_loop);
4240 
4241     bind(test_tail);
4242     // bail out when there is nothing to be done
4243     testl(tmp1, -1);
4244     jcc(Assembler::zero, DONE);
4245 
4246 
4247     // check the tail for absence of negatives
4248     // ~(~0 << len) applied up to two times (for 32-bit scenario)
4249     {
4250       Register tmp3_aliased = len;
4251       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4252       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4253       notq(tmp3_aliased);
4254       kmovql(mask2, tmp3_aliased);
4255     }
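         // Worked example (illustrative): for a tail count of tmp1 == 3,
         //   ~0 << 3    == 0xFFFFFFFFFFFFFFF8
         //   ~(~0 << 3) == 0x0000000000000007
         // so mask2 enables only the first 3 byte lanes of the masked compare
         // below; lanes beyond the tail cannot influence the result.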
4256 
4257     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4258     ktestq(mask1, mask2);
4259     jcc(Assembler::zero, DONE);
4260 
4261     // do a full check for negative bytes in the tail
4262     movl(len, tmp1); // tmp1 holds the low 6 bits of the original len;
4263                      // ary1 already points to the right place
4264     jmpb(TAIL_START);
4265 
4266     bind(BREAK_LOOP);
4267     // At least one byte in the last 64 byte block was negative.
4268     // Set up to look at the last 64 bytes as if they were a tail
4269     lea(ary1, Address(ary1, len, Address::times_1));
4270     addptr(result, len);
4271     // Ignore the very last byte: if all others are positive,
4272     // it must be negative, so we can skip right to the 2+1 byte
4273     // end comparison at this point
4274     orl(result, 63);
4275     movl(len, 63);
4276     // Fallthru to tail compare
4277   } else {
4278 
4279     if (UseAVX >= 2) {
4280       // With AVX2, use 32-byte vector compare
4281       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4282 
4283       // Compare 32-byte vectors
4284       testl(len, 0xffffffe0);   // vector count (in bytes)
4285       jccb(Assembler::zero, TAIL_START);
4286 
4287       andl(len, 0xffffffe0);
4288       lea(ary1, Address(ary1, len, Address::times_1));
4289       negptr(len);
4290 
4291       movl(tmp1, 0x80808080);   // create mask to test for negative bytes in the vector
4292       movdl(vec2, tmp1);
4293       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4294 
4295       bind(COMPARE_WIDE_VECTORS);
4296       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4297       vptest(vec1, vec2);
4298       jccb(Assembler::notZero, BREAK_LOOP);
4299       addptr(len, 32);
4300       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4301 
4302       testl(result, 0x0000001f);   // any bytes remaining?
4303       jcc(Assembler::zero, DONE);
4304 
4305       // Quick test using the already prepared vector mask
4306       movl(len, result);
4307       andl(len, 0x0000001f);
4308       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4309       vptest(vec1, vec2);
4310       jcc(Assembler::zero, DONE);
4311       // There are negative bytes; jump to the tail to determine exactly where
4312       jmpb(TAIL_START);
4313 
4314       bind(BREAK_LOOP);
4315       // At least one byte in the last 32-byte vector is negative.
4316       // Set up to look at the last 32 bytes as if they were a tail
4317       lea(ary1, Address(ary1, len, Address::times_1));
4318       addptr(result, len);
4319       // Ignore the very last byte: if all others are positive,
4320       // it must be negative, so we can skip right to the 2+1 byte
4321       // end comparison at this point
4322       orl(result, 31);
4323       movl(len, 31);
4324       // Fallthru to tail compare
4325     } else if (UseSSE42Intrinsics) {
4326       // With SSE4.2, use double quad vector compare
4327       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4328 
4329       // Compare 16-byte vectors
4330       testl(len, 0xfffffff0);   // vector count (in bytes)
4331       jcc(Assembler::zero, TAIL_START);
4332 
4333       andl(len, 0xfffffff0);
4334       lea(ary1, Address(ary1, len, Address::times_1));
4335       negptr(len);
4336 
4337       movl(tmp1, 0x80808080);
4338       movdl(vec2, tmp1);
4339       pshufd(vec2, vec2, 0);
4340 
4341       bind(COMPARE_WIDE_VECTORS);
4342       movdqu(vec1, Address(ary1, len, Address::times_1));
4343       ptest(vec1, vec2);
4344       jccb(Assembler::notZero, BREAK_LOOP);
4345       addptr(len, 16);
4346       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4347 
4348       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4349       jcc(Assembler::zero, DONE);
4350 
4351       // Quick test using the already prepared vector mask
4352       movl(len, result);
4353       andl(len, 0x0000000f);   // tail count (in bytes)
4354       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4355       ptest(vec1, vec2);
4356       jcc(Assembler::zero, DONE);
4357       jmpb(TAIL_START);
4358 
4359       bind(BREAK_LOOP);
4360       // At least one byte in the last 16-byte vector is negative.
4361       // Set up and look at the last 16 bytes as if they were a tail
4362       lea(ary1, Address(ary1, len, Address::times_1));
4363       addptr(result, len);
4364       // Ignore the very last byte: if all others are positive,
4365       // it must be negative, so we can skip right to the 2+1 byte
4366       // end comparison at this point
4367       orl(result, 15);
4368       movl(len, 15);
4369       // Fallthru to tail compare
4370     }
4371   }
4372 
4373   bind(TAIL_START);
4374   // Compare 4-byte vectors
4375   andl(len, 0xfffffffc); // vector count (in bytes)
4376   jccb(Assembler::zero, COMPARE_CHAR);
4377 
4378   lea(ary1, Address(ary1, len, Address::times_1));
4379   negptr(len);
4380 
4381   bind(COMPARE_VECTORS);
4382   movl(tmp1, Address(ary1, len, Address::times_1));
4383   andl(tmp1, 0x80808080);
4384   jccb(Assembler::notZero, TAIL_ADJUST);
4385   addptr(len, 4);
4386   jccb(Assembler::notZero, COMPARE_VECTORS);
4387 
4388   // Compare trailing char (final 2-3 bytes), if any
4389   bind(COMPARE_CHAR);
4390 
4391   testl(result, 0x2);   // tail  char
4392   jccb(Assembler::zero, COMPARE_BYTE);
4393   load_unsigned_short(tmp1, Address(ary1, 0));
4394   andl(tmp1, 0x00008080);
4395   jccb(Assembler::notZero, CHAR_ADJUST);
4396   lea(ary1, Address(ary1, 2));
4397 
4398   bind(COMPARE_BYTE);
4399   testl(result, 0x1);   // tail  byte
4400   jccb(Assembler::zero, DONE);
4401   load_unsigned_byte(tmp1, Address(ary1, 0));
4402   testl(tmp1, 0x00000080);
4403   jccb(Assembler::zero, DONE);
4404   subptr(result, 1);
4405   jmpb(DONE);
4406 
4407   bind(TAIL_ADJUST);
4408   // There are negative bytes in the last 4-byte block.
4409   // Adjust result and check the next three bytes
4410   addptr(result, len);
4411   orl(result, 3);
4412   lea(ary1, Address(ary1, len, Address::times_1));
4413   jmpb(COMPARE_CHAR);
4414 
4415   bind(CHAR_ADJUST);
4416   // We are looking at a char + optional byte tail, and found that one
4417   // of the bytes in the char is negative. Adjust the result, check the
4418   // first byte and readjust if needed.
4419   andl(result, 0xfffffffc);
4420   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4421   jccb(Assembler::notZero, DONE);
4422   addptr(result, 1);
4423 
4424   // That's it
4425   bind(DONE);
4426   if (UseAVX >= 2) {
4427     // clean upper bits of YMM registers
4428     vpxor(vec1, vec1);
4429     vpxor(vec2, vec2);
4430   }
4431 }
4432 
4433 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
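     // Illustrative scalar sketch of the result (notation is ours): result is
     // set to 1 if all compared elements are equal and to 0 on the first
     // mismatch; with expand_ary2, each byte of ary2 is zero-extended and
     // compared against the corresponding 16-bit element of ary1:
     //   for (int i = 0; i < limit; i++) {
     //     if (element_of(ary1, i) !=
     //         (expand_ary2 ? (ary2[i] & 0xff) : element_of(ary2, i))) {
     //       return 0;   // FALSE_LABEL
     //     }
     //   }
     //   return 1;       // TRUE_LABEL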
4434 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4435                                       Register limit, Register result, Register chr,
4436                                       XMMRegister vec1, XMMRegister vec2, bool is_char,
4437                                       KRegister mask, bool expand_ary2) {
4438   // For expand_ary2, limit is the (smaller) size of the second array.
4439   ShortBranchVerifier sbv(this);
4440   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4441 
4442   assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4443          "Expansion only implemented for AVX2");
4444 
4445   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4446   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4447 
4448   Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4449   int scaleIncr = expand_ary2 ? 8 : 16;
4450 
4451   if (is_array_equ) {
4452     // Check the input args
4453     cmpoop(ary1, ary2);
4454     jcc(Assembler::equal, TRUE_LABEL);
4455 
4456     // Need additional checks for arrays_equals.
4457     testptr(ary1, ary1);
4458     jcc(Assembler::zero, FALSE_LABEL);
4459     testptr(ary2, ary2);
4460     jcc(Assembler::zero, FALSE_LABEL);
4461 
4462     // Check the lengths
4463     movl(limit, Address(ary1, length_offset));
4464     cmpl(limit, Address(ary2, length_offset));
4465     jcc(Assembler::notEqual, FALSE_LABEL);
4466   }
4467 
4468   // count == 0
4469   testl(limit, limit);
4470   jcc(Assembler::zero, TRUE_LABEL);
4471 
4472   if (is_array_equ) {
4473     // Load array address
4474     lea(ary1, Address(ary1, base_offset));
4475     lea(ary2, Address(ary2, base_offset));
4476   }
4477 
4478   if (is_array_equ && is_char) {
4479     // arrays_equals when used for char[].
4480     shll(limit, 1);      // byte count != 0
4481   }
4482   movl(result, limit); // copy
4483 
4484   if (UseAVX >= 2) {
4485     // With AVX2, use 32-byte vector compare
4486     Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4487 
4488     // Compare 32-byte vectors
4489     if (expand_ary2) {
4490       andl(result, 0x0000000f);  //   tail count (in bytes)
4491       andl(limit, 0xfffffff0);   // vector count (in bytes)
4492       jcc(Assembler::zero, COMPARE_TAIL);
4493     } else {
4494       andl(result, 0x0000001f);  //   tail count (in bytes)
4495       andl(limit, 0xffffffe0);   // vector count (in bytes)
4496       jcc(Assembler::zero, COMPARE_TAIL_16);
4497     }
4498 
4499     lea(ary1, Address(ary1, limit, scaleFactor));
4500     lea(ary2, Address(ary2, limit, Address::times_1));
4501     negptr(limit);
4502 
4503     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4504       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4505 
4506       cmpl(limit, -64);
4507       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4508 
4509       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4510 
4511       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4512       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4513       kortestql(mask, mask);
4514       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4515       addptr(limit, 64);  // update since we already compared at this addr
4516       cmpl(limit, -64);
4517       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4518 
4519       // At this point we may still need to compare -limit+result bytes.
4520       // We could execute the next two instructions and just continue via the non-wide path:
4521       //  cmpl(limit, 0);
4522       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4523       // But since we stopped at the points ary{1,2}+limit, which are
4524       // no farther than 64 bytes from the ends of the arrays ary{1,2}+result
4525       // (|limit| <= 32 and result < 32),
4526       // we may just compare the last 64 bytes.
4527       //
4528       addptr(result, -64);   // it is safe, because we just came from this area
4529       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4530       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4531       kortestql(mask, mask);
4532       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4533 
4534       jmp(TRUE_LABEL);
4535 
4536       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4537 
4538     }//if (VM_Version::supports_avx512vlbw())
4539 
4540     bind(COMPARE_WIDE_VECTORS);
4541     vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4542     if (expand_ary2) {
4543       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4544     } else {
4545       vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4546     }
4547     vpxor(vec1, vec2);
4548 
4549     vptest(vec1, vec1);
4550     jcc(Assembler::notZero, FALSE_LABEL);
4551     addptr(limit, scaleIncr * 2);
4552     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4553 
4554     testl(result, result);
4555     jcc(Assembler::zero, TRUE_LABEL);
4556 
4557     vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4558     if (expand_ary2) {
4559       vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4560     } else {
4561       vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4562     }
4563     vpxor(vec1, vec2);
4564 
4565     vptest(vec1, vec1);
4566     jcc(Assembler::notZero, FALSE_LABEL);
4567     jmp(TRUE_LABEL);
4568 
4569     bind(COMPARE_TAIL_16); // limit is zero
4570     movl(limit, result);
4571 
4572     // Compare 16-byte chunks
4573     andl(result, 0x0000000f);  //   tail count (in bytes)
4574     andl(limit, 0xfffffff0);   // vector count (in bytes)
4575     jcc(Assembler::zero, COMPARE_TAIL);
4576 
4577     lea(ary1, Address(ary1, limit, scaleFactor));
4578     lea(ary2, Address(ary2, limit, Address::times_1));
4579     negptr(limit);
4580 
4581     bind(COMPARE_WIDE_VECTORS_16);
4582     movdqu(vec1, Address(ary1, limit, scaleFactor));
4583     if (expand_ary2) {
4584       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4585     } else {
4586       movdqu(vec2, Address(ary2, limit, Address::times_1));
4587     }
4588     pxor(vec1, vec2);
4589 
4590     ptest(vec1, vec1);
4591     jcc(Assembler::notZero, FALSE_LABEL);
4592     addptr(limit, scaleIncr);
4593     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4594 
4595     bind(COMPARE_TAIL); // limit is zero
4596     movl(limit, result);
4597     // Fallthru to tail compare
4598   } else if (UseSSE42Intrinsics) {
4599     // With SSE4.2, use double quad vector compare
4600     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4601 
4602     // Compare 16-byte vectors
4603     andl(result, 0x0000000f);  //   tail count (in bytes)
4604     andl(limit, 0xfffffff0);   // vector count (in bytes)
4605     jcc(Assembler::zero, COMPARE_TAIL);
4606 
4607     lea(ary1, Address(ary1, limit, Address::times_1));
4608     lea(ary2, Address(ary2, limit, Address::times_1));
4609     negptr(limit);
4610 
4611     bind(COMPARE_WIDE_VECTORS);
4612     movdqu(vec1, Address(ary1, limit, Address::times_1));
4613     movdqu(vec2, Address(ary2, limit, Address::times_1));
4614     pxor(vec1, vec2);
4615 
4616     ptest(vec1, vec1);
4617     jcc(Assembler::notZero, FALSE_LABEL);
4618     addptr(limit, 16);
4619     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4620 
4621     testl(result, result);
4622     jcc(Assembler::zero, TRUE_LABEL);
4623 
4624     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4625     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4626     pxor(vec1, vec2);
4627 
4628     ptest(vec1, vec1);
4629     jccb(Assembler::notZero, FALSE_LABEL);
4630     jmpb(TRUE_LABEL);
4631 
4632     bind(COMPARE_TAIL); // limit is zero
4633     movl(limit, result);
4634     // Fallthru to tail compare
4635   }
4636 
4637   // Compare 4-byte vectors
4638   if (expand_ary2) {
4639     testl(result, result);
4640     jccb(Assembler::zero, TRUE_LABEL);
4641   } else {
4642     andl(limit, 0xfffffffc); // vector count (in bytes)
4643     jccb(Assembler::zero, COMPARE_CHAR);
4644   }
4645 
4646   lea(ary1, Address(ary1, limit, scaleFactor));
4647   lea(ary2, Address(ary2, limit, Address::times_1));
4648   negptr(limit);
4649 
4650   bind(COMPARE_VECTORS);
4651   if (expand_ary2) {
4652     // There are no "vector" operations for bytes to shorts
4653     movzbl(chr, Address(ary2, limit, Address::times_1));
4654     cmpw(Address(ary1, limit, Address::times_2), chr);
4655     jccb(Assembler::notEqual, FALSE_LABEL);
4656     addptr(limit, 1);
4657     jcc(Assembler::notZero, COMPARE_VECTORS);
4658     jmp(TRUE_LABEL);
4659   } else {
4660     movl(chr, Address(ary1, limit, Address::times_1));
4661     cmpl(chr, Address(ary2, limit, Address::times_1));
4662     jccb(Assembler::notEqual, FALSE_LABEL);
4663     addptr(limit, 4);
4664     jcc(Assembler::notZero, COMPARE_VECTORS);
4665   }
4666 
4667   // Compare trailing char (final 2 bytes), if any
4668   bind(COMPARE_CHAR);
4669   testl(result, 0x2);   // tail  char
4670   jccb(Assembler::zero, COMPARE_BYTE);
4671   load_unsigned_short(chr, Address(ary1, 0));
4672   load_unsigned_short(limit, Address(ary2, 0));
4673   cmpl(chr, limit);
4674   jccb(Assembler::notEqual, FALSE_LABEL);
4675 
4676   if (is_array_equ && is_char) {
4677     bind(COMPARE_BYTE);
4678   } else {
4679     lea(ary1, Address(ary1, 2));
4680     lea(ary2, Address(ary2, 2));
4681 
4682     bind(COMPARE_BYTE);
4683     testl(result, 0x1);   // tail  byte
4684     jccb(Assembler::zero, TRUE_LABEL);
4685     load_unsigned_byte(chr, Address(ary1, 0));
4686     load_unsigned_byte(limit, Address(ary2, 0));
4687     cmpl(chr, limit);
4688     jccb(Assembler::notEqual, FALSE_LABEL);
4689   }
4690   bind(TRUE_LABEL);
4691   movl(result, 1);   // return true
4692   jmpb(DONE);
4693 
4694   bind(FALSE_LABEL);
4695   xorl(result, result); // return false
4696 
4697   // That's it
4698   bind(DONE);
4699   if (UseAVX >= 2) {
4700     // clean upper bits of YMM registers
4701     vpxor(vec1, vec1);
4702     vpxor(vec2, vec2);
4703   }
4704 }
4705 
4706 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4707 #define __ masm.
4708   Register dst = stub.data<0>();
4709   XMMRegister src = stub.data<1>();
4710   address target = stub.data<2>();
4711   __ bind(stub.entry());
4712   __ subptr(rsp, 8);
4713   __ movdbl(Address(rsp), src);
4714   __ call(RuntimeAddress(target));
4715   // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
4716   __ pop(dst);
4717   __ jmp(stub.continuation());
4718 #undef __
4719 }
4720 
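     // Convert float/double to int/long following the Java narrowing semantics
     // (explanatory note; the exact handling lives in the fixup stubs). The fast
     // path relies on cvttss2si/cvttsd2si producing the x86 "integer indefinite"
     // value (0x80000000, or 0x8000000000000000 for the 64-bit forms) when the
     // source is NaN or out of range, so a single compare against that pattern
     // detects the rare inputs that need the slow path. The stub then applies
     // the Java rules, e.g. (int)Float.NaN == 0 and (int)1.0e30f == Integer.MAX_VALUE.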
4721 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4722   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4723   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4724 
4725   address slowpath_target;
4726   if (dst_bt == T_INT) {
4727     if (src_bt == T_FLOAT) {
4728       cvttss2sil(dst, src);
4729       cmpl(dst, 0x80000000);
4730       slowpath_target = StubRoutines::x86::f2i_fixup();
4731     } else {
4732       cvttsd2sil(dst, src);
4733       cmpl(dst, 0x80000000);
4734       slowpath_target = StubRoutines::x86::d2i_fixup();
4735     }
4736   } else {
4737     if (src_bt == T_FLOAT) {
4738       cvttss2siq(dst, src);
4739       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4740       slowpath_target = StubRoutines::x86::f2l_fixup();
4741     } else {
4742       cvttsd2siq(dst, src);
4743       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4744       slowpath_target = StubRoutines::x86::d2l_fixup();
4745     }
4746   }
4747 
4748   // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
4749   int max_size = 23 + (UseAPX ? 1 : 0);
4750   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
4751   jcc(Assembler::equal, stub->entry());
4752   bind(stub->continuation());
4753 }
4754 
4755 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4756                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4757   switch(ideal_opc) {
4758     case Op_LShiftVS:
4759       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4760     case Op_LShiftVI:
4761       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4762     case Op_LShiftVL:
4763       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4764     case Op_RShiftVS:
4765       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4766     case Op_RShiftVI:
4767       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4768     case Op_RShiftVL:
4769       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4770     case Op_URShiftVS:
4771       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4772     case Op_URShiftVI:
4773       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4774     case Op_URShiftVL:
4775       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4776     case Op_RotateRightV:
4777       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4778     case Op_RotateLeftV:
4779       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4780     default:
4781       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4782       break;
4783   }
4784 }
4785 
4786 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4787                                                XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4788   if (is_unsigned) {
4789     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4790   } else {
4791     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4792   }
4793 }
4794 
4795 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4796                                                       XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4797   switch (elem_bt) {
4798     case T_BYTE:
4799       if (ideal_opc == Op_SaturatingAddV) {
4800         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4801       } else {
4802         assert(ideal_opc == Op_SaturatingSubV, "");
4803         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4804       }
4805       break;
4806     case T_SHORT:
4807       if (ideal_opc == Op_SaturatingAddV) {
4808         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4809       } else {
4810         assert(ideal_opc == Op_SaturatingSubV, "");
4811         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4812       }
4813       break;
4814     default:
4815       fatal("Unsupported type %s", type2name(elem_bt));
4816       break;
4817   }
4818 }
4819 
4820 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4821                                                         XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4822   switch (elem_bt) {
4823     case T_BYTE:
4824       if (ideal_opc == Op_SaturatingAddV) {
4825         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4826       } else {
4827         assert(ideal_opc == Op_SaturatingSubV, "");
4828         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4829       }
4830       break;
4831     case T_SHORT:
4832       if (ideal_opc == Op_SaturatingAddV) {
4833         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4834       } else {
4835         assert(ideal_opc == Op_SaturatingSubV, "");
4836         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4837       }
4838       break;
4839     default:
4840       fatal("Unsupported type %s", type2name(elem_bt));
4841       break;
4842   }
4843 }
4844 
4845 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4846                                                Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4847   if (is_unsigned) {
4848     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4849   } else {
4850     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4851   }
4852 }
4853 
4854 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4855                                                       XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4856   switch (elem_bt) {
4857     case T_BYTE:
4858       if (ideal_opc == Op_SaturatingAddV) {
4859         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4860       } else {
4861         assert(ideal_opc == Op_SaturatingSubV, "");
4862         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4863       }
4864       break;
4865     case T_SHORT:
4866       if (ideal_opc == Op_SaturatingAddV) {
4867         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4868       } else {
4869         assert(ideal_opc == Op_SaturatingSubV, "");
4870         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4871       }
4872       break;
4873     default:
4874       fatal("Unsupported type %s", type2name(elem_bt));
4875       break;
4876   }
4877 }
4878 
4879 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4880                                                         XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4881   switch (elem_bt) {
4882     case T_BYTE:
4883       if (ideal_opc == Op_SaturatingAddV) {
4884         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4885       } else {
4886         assert(ideal_opc == Op_SaturatingSubV, "");
4887         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4888       }
4889       break;
4890     case T_SHORT:
4891       if (ideal_opc == Op_SaturatingAddV) {
4892         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4893       } else {
4894         assert(ideal_opc == Op_SaturatingSubV, "");
4895         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4896       }
4897       break;
4898     default:
4899       fatal("Unsupported type %s", type2name(elem_bt));
4900       break;
4901   }
4902 }
4903 
4904 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4905                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4906                                     bool is_varshift) {
4907   switch (ideal_opc) {
4908     case Op_AddVB:
4909       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4910     case Op_AddVS:
4911       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4912     case Op_AddVI:
4913       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4914     case Op_AddVL:
4915       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4916     case Op_AddVF:
4917       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4918     case Op_AddVD:
4919       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4920     case Op_SubVB:
4921       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4922     case Op_SubVS:
4923       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4924     case Op_SubVI:
4925       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4926     case Op_SubVL:
4927       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4928     case Op_SubVF:
4929       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4930     case Op_SubVD:
4931       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4932     case Op_MulVS:
4933       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4934     case Op_MulVI:
4935       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4936     case Op_MulVL:
4937       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4938     case Op_MulVF:
4939       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4940     case Op_MulVD:
4941       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4942     case Op_DivVF:
4943       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4944     case Op_DivVD:
4945       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4946     case Op_SqrtVF:
4947       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4948     case Op_SqrtVD:
4949       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4950     case Op_AbsVB:
4951       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4952     case Op_AbsVS:
4953       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4954     case Op_AbsVI:
4955       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4956     case Op_AbsVL:
4957       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4958     case Op_FmaVF:
4959       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4960     case Op_FmaVD:
4961       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4962     case Op_VectorRearrange:
4963       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4964     case Op_LShiftVS:
4965       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4966     case Op_LShiftVI:
4967       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4968     case Op_LShiftVL:
4969       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4970     case Op_RShiftVS:
4971       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4972     case Op_RShiftVI:
4973       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4974     case Op_RShiftVL:
4975       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4976     case Op_URShiftVS:
4977       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4978     case Op_URShiftVI:
4979       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4980     case Op_URShiftVL:
4981       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4982     case Op_RotateLeftV:
4983       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4984     case Op_RotateRightV:
4985       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4986     case Op_MaxV:
4987       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4988     case Op_MinV:
4989       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4990     case Op_UMinV:
4991       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4992     case Op_UMaxV:
4993       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4994     case Op_XorV:
4995       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4996     case Op_OrV:
4997       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4998     case Op_AndV:
4999       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5000     default:
5001       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
5002       break;
5003   }
5004 }
5005 
5006 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
5007                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
5008   switch (ideal_opc) {
5009     case Op_AddVB:
5010       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
5011     case Op_AddVS:
5012       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
5013     case Op_AddVI:
5014       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
5015     case Op_AddVL:
5016       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
5017     case Op_AddVF:
5018       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
5019     case Op_AddVD:
5020       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
5021     case Op_SubVB:
5022       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
5023     case Op_SubVS:
5024       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
5025     case Op_SubVI:
5026       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
5027     case Op_SubVL:
5028       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
5029     case Op_SubVF:
5030       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
5031     case Op_SubVD:
5032       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
5033     case Op_MulVS:
5034       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
5035     case Op_MulVI:
5036       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
5037     case Op_MulVL:
5038       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
5039     case Op_MulVF:
5040       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
5041     case Op_MulVD:
5042       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
5043     case Op_DivVF:
5044       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
5045     case Op_DivVD:
5046       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
5047     case Op_FmaVF:
5048       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
5049     case Op_FmaVD:
5050       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
5051     case Op_MaxV:
5052       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5053     case Op_MinV:
5054       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5055     case Op_UMaxV:
5056       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5057     case Op_UMinV:
5058       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5059     case Op_XorV:
5060       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5061     case Op_OrV:
5062       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5063     case Op_AndV:
5064       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5065     default:
5066       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
5067       break;
5068   }
5069 }
5070 
5071 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
5072                                   KRegister src1, KRegister src2) {
5073   BasicType etype = T_ILLEGAL;
5074   switch(mask_len) {
5075     case 2:
5076     case 4:
5077     case 8:  etype = T_BYTE; break;
5078     case 16: etype = T_SHORT; break;
5079     case 32: etype = T_INT; break;
5080     case 64: etype = T_LONG; break;
5081     default: fatal("Unsupported type"); break;
5082   }
5083   assert(etype != T_ILLEGAL, "");
5084   switch(ideal_opc) {
5085     case Op_AndVMask:
5086       kand(etype, dst, src1, src2); break;
5087     case Op_OrVMask:
5088       kor(etype, dst, src1, src2); break;
5089     case Op_XorVMask:
5090       kxor(etype, dst, src1, src2); break;
5091     default:
5092       fatal("Unsupported masked operation"); break;
5093   }
5094 }
5095 
5096 /*
5097  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
5098  * If src is NaN, the result is 0.
5099  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
5100  * the result is equal to the value of Integer.MIN_VALUE.
5101  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
5102  * the result is equal to the value of Integer.MAX_VALUE.
5103  */
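     // For example (Java semantics): (int)Float.NaN == 0,
     // (int)Float.NEGATIVE_INFINITY == Integer.MIN_VALUE and
     // (int)Float.POSITIVE_INFINITY == Integer.MAX_VALUE.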
5104 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5105                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5106                                                                    Register rscratch, AddressLiteral float_sign_flip,
5107                                                                    int vec_enc) {
5108   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5109   Label done;
5110   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
5111   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5112   vptest(xtmp2, xtmp2, vec_enc);
5113   jccb(Assembler::equal, done);
5114 
5115   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
5116   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
5117 
5118   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5119   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
5120   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
5121 
5122   // Recompute the mask for remaining special value.
5123   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
5124   // Extract SRC values corresponding to TRUE mask lanes.
5125   vpand(xtmp4, xtmp2, src, vec_enc);
5126   // Flip mask bits so that the MSBs of MASK lanes corresponding to positive
5127   // special values are set.
5128   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
5129 
5130   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
5131   bind(done);
5132 }
5133 
5134 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5135                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5136                                                                     Register rscratch, AddressLiteral float_sign_flip,
5137                                                                     int vec_enc) {
5138   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5139   Label done;
5140   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5141   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5142   kortestwl(ktmp1, ktmp1);
5143   jccb(Assembler::equal, done);
5144 
5145   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5146   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5147   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5148 
5149   kxorwl(ktmp1, ktmp1, ktmp2);
5150   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5151   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5152   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5153   bind(done);
5154 }
5155 
5156 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5157                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5158                                                                      Register rscratch, AddressLiteral double_sign_flip,
5159                                                                      int vec_enc) {
5160   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5161 
5162   Label done;
5163   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5164   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
5165   kortestwl(ktmp1, ktmp1);
5166   jccb(Assembler::equal, done);
5167 
5168   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5169   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5170   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5171 
5172   kxorwl(ktmp1, ktmp1, ktmp2);
5173   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5174   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5175   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5176   bind(done);
5177 }
5178 
5179 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5180                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5181                                                                      Register rscratch, AddressLiteral float_sign_flip,
5182                                                                      int vec_enc) {
5183   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5184   Label done;
5185   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5186   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5187   kortestwl(ktmp1, ktmp1);
5188   jccb(Assembler::equal, done);
5189 
5190   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5191   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5192   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5193 
5194   kxorwl(ktmp1, ktmp1, ktmp2);
5195   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5196   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5197   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5198   bind(done);
5199 }
5200 
5201 /*
5202  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
5203  * If src is NaN, the result is 0.
5204  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5205  * the result is equal to the value of Long.MIN_VALUE.
5206  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5207  * the result is equal to the value of Long.MAX_VALUE.
5208  */
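     // Analogous examples for the long case (Java semantics): (long)Double.NaN == 0L,
     // (long)Double.NEGATIVE_INFINITY == Long.MIN_VALUE and (long)1.0e30 == Long.MAX_VALUE.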
5209 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5210                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5211                                                                       Register rscratch, AddressLiteral double_sign_flip,
5212                                                                       int vec_enc) {
5213   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5214 
5215   Label done;
5216   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5217   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
5218   kortestwl(ktmp1, ktmp1);
5219   jccb(Assembler::equal, done);
5220 
5221   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5222   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5223   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5224 
5225   kxorwl(ktmp1, ktmp1, ktmp2);
5226   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5227   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5228   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5229   bind(done);
5230 }
5231 
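     // Helper used to pack doublewords across 128-bit lanes on AVX/AVX2 (an
     // explanatory note; see the callers below for the concrete uses). Per
     // VSHUFPS semantics each 128-bit lane of the result is
     //   { a[index[1:0]], a[index[3:2]], b[index[5:4]], b[index[7:6]] },
     // so index 0x88 selects dwords 0 and 2 of each operand (the low dword of
     // every qword) and 0x44 selects dwords 0 and 1. For 256-bit vectors the
     // high 128-bit lane of src is first extracted into xtmp so that elements
     // from both lanes can be combined.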
5232 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5233                                                              XMMRegister xtmp, int index, int vec_enc) {
5234    assert(vec_enc < Assembler::AVX_512bit, "");
5235    if (vec_enc == Assembler::AVX_256bit) {
5236      vextractf128_high(xtmp, src);
5237      vshufps(dst, src, xtmp, index, vec_enc);
5238    } else {
5239      vshufps(dst, src, zero, index, vec_enc);
5240    }
5241 }
5242 
5243 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5244                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5245                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
5246   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5247 
5248   Label done;
5249   // Compare the destination lanes with float_sign_flip
5250   // value to get mask for all special values.
5251   movdqu(xtmp1, float_sign_flip, rscratch);
5252   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5253   ptest(xtmp2, xtmp2);
5254   jccb(Assembler::equal, done);
5255 
5256   // Flip float_sign_flip to get max integer value.
5257   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5258   pxor(xtmp1, xtmp4);
5259 
5260   // Set destination lanes corresponding to unordered source lanes to zero.
5261   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5262   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5263 
5264   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5265   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5266   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5267 
5268   // Recompute the mask for remaining special value.
5269   pxor(xtmp2, xtmp3);
5270   // Extract mask corresponding to non-negative source lanes.
5271   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5272 
5273   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5274   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5275   pand(xtmp3, xtmp2);
5276 
5277   // Replace destination lanes holding the special value (0x80000000) with max int
5278   // if the corresponding source lane holds a positive value.
5279   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5280   bind(done);
5281 }
5282 
5283 
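     // Narrows int lanes to short/byte lanes. VPACKUSDW/VPACKUSWB saturate
     // rather than truncate, so the inputs are first ANDed with a mask
     // (vector_int_to_short_mask / vector_int_to_byte_mask) that keeps only the
     // low 16 (resp. 8) bits of each int lane; the pack then behaves like a
     // plain truncation. On AVX2 the packs operate per 128-bit lane, so a
     // cross-lane doubleword pack is needed afterwards to put the packed halves
     // back in order.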
5284 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5285                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5286   switch(to_elem_bt) {
5287     case T_SHORT:
5288       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5289       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5290       vpackusdw(dst, dst, zero, vec_enc);
5291       if (vec_enc == Assembler::AVX_256bit) {
5292         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5293       }
5294       break;
5295     case  T_BYTE:
5296       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5297       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5298       vpackusdw(dst, dst, zero, vec_enc);
5299       if (vec_enc == Assembler::AVX_256bit) {
5300         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5301       }
5302       vpackuswb(dst, dst, zero, vec_enc);
5303       break;
5304     default: assert(false, "%s", type2name(to_elem_bt));
5305   }
5306 }
5307 
5308 /*
5309  * Algorithm for vector D2L and F2I conversions:-
5310  * a) Perform vector D2L/F2I cast.
5311  * b) Choose fast path if none of the result vector lane contains 0x80000000 value.
5312  *    It signifies that source value could be any of the special floating point
5313  *    values(NaN,-Inf,Inf,Max,-Min).
5314  * c) Set destination to zero if source is NaN value.
5315  * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5316  */
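     // Note on step (b): the truncating conversions (vcvttps2dq, vcvttpd2qq,
     // etc.) write the x86 "integer indefinite" pattern (0x80000000, or
     // 0x8000000000000000 for 64-bit lanes) into every lane whose source is NaN
     // or out of range, so probing the result vector for that pattern finds all
     // lanes that need the special handling of steps (c) and (d).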
5317 
5318 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5319                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5320                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5321   int to_elem_sz = type2aelembytes(to_elem_bt);
5322   assert(to_elem_sz <= 4, "");
5323   vcvttps2dq(dst, src, vec_enc);
5324   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5325   if (to_elem_sz < 4) {
5326     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5327     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5328   }
5329 }
5330 
5331 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5332                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5333                                             Register rscratch, int vec_enc) {
5334   int to_elem_sz = type2aelembytes(to_elem_bt);
5335   assert(to_elem_sz <= 4, "");
5336   vcvttps2dq(dst, src, vec_enc);
5337   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5338   switch(to_elem_bt) {
5339     case T_INT:
5340       break;
5341     case T_SHORT:
5342       evpmovdw(dst, dst, vec_enc);
5343       break;
5344     case T_BYTE:
5345       evpmovdb(dst, dst, vec_enc);
5346       break;
5347     default: assert(false, "%s", type2name(to_elem_bt));
5348   }
5349 }
5350 
5351 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5352                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5353                                             Register rscratch, int vec_enc) {
5354   evcvttps2qq(dst, src, vec_enc);
5355   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5356 }
5357 
5358 // Handling for downcasting from double to integer or sub-word types on AVX2.
5359 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5360                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5361                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5362   int to_elem_sz = type2aelembytes(to_elem_bt);
5363   assert(to_elem_sz < 8, "");
5364   vcvttpd2dq(dst, src, vec_enc);
5365   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5366                                               float_sign_flip, vec_enc);
5367   if (to_elem_sz < 4) {
5368     // xtmp4 holds all zero lanes.
5369     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5370   }
5371 }
5372 
5373 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5374                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5375                                             KRegister ktmp2, AddressLiteral sign_flip,
5376                                             Register rscratch, int vec_enc) {
5377   if (VM_Version::supports_avx512dq()) {
5378     evcvttpd2qq(dst, src, vec_enc);
5379     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5380     switch(to_elem_bt) {
5381       case T_LONG:
5382         break;
5383       case T_INT:
5384         evpmovsqd(dst, dst, vec_enc);
5385         break;
5386       case T_SHORT:
5387         evpmovsqd(dst, dst, vec_enc);
5388         evpmovdw(dst, dst, vec_enc);
5389         break;
5390       case T_BYTE:
5391         evpmovsqd(dst, dst, vec_enc);
5392         evpmovdb(dst, dst, vec_enc);
5393         break;
5394       default: assert(false, "%s", type2name(to_elem_bt));
5395     }
5396   } else {
5397     assert(type2aelembytes(to_elem_bt) <= 4, "");
5398     vcvttpd2dq(dst, src, vec_enc);
5399     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5400     switch(to_elem_bt) {
5401       case T_INT:
5402         break;
5403       case T_SHORT:
5404         evpmovdw(dst, dst, vec_enc);
5405         break;
5406       case T_BYTE:
5407         evpmovdb(dst, dst, vec_enc);
5408         break;
5409       default: assert(false, "%s", type2name(to_elem_bt));
5410     }
5411   }
5412 }
5413 
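     // Vector rounding with Math.round-style semantics. Rounding is implemented
     // as floor(val + 0.5): MXCSR.RC is temporarily switched to round towards
     // negative infinity (new_mxcsr), the biased value is converted, and the
     // standard MXCSR is restored afterwards. The float variants below follow
     // the same scheme. Illustrative examples:
     //   round( 2.5) == floor( 3.0) ==  3
     //   round(-2.5) == floor(-2.0) == -2   (half values round up, as in Java)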
5414 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5415                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5416                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5417   // Perform the floor(val+0.5) operation with MXCSR.RC temporarily set to round towards -inf,
5418   // and restore the original MXCSR.RC mode afterwards.
5419   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5420 
5421   mov64(tmp, julong_cast(0.5L));
5422   evpbroadcastq(xtmp1, tmp, vec_enc);
5423   vaddpd(xtmp1, src , xtmp1, vec_enc);
5424   evcvtpd2qq(dst, xtmp1, vec_enc);
5425   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5426                                                 double_sign_flip, vec_enc);
5427 
5428   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5429 }
5430 
5431 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5432                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5433                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5434   // Perform the floor(val+0.5) operation with MXCSR.RC temporarily set to round towards -inf,
5435   // and restore the original MXCSR.RC mode afterwards.
5436   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5437 
5438   movl(tmp, jint_cast(0.5));
5439   movq(xtmp1, tmp);
5440   vbroadcastss(xtmp1, xtmp1, vec_enc);
5441   vaddps(xtmp1, src , xtmp1, vec_enc);
5442   vcvtps2dq(dst, xtmp1, vec_enc);
5443   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5444                                               float_sign_flip, vec_enc);
5445 
5446   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5447 }
5448 
5449 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5450                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5451                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
5452   // Perform the floor(val+0.5) operation with MXCSR.RC temporarily set to round towards -inf,
5453   // and restore the original MXCSR.RC mode afterwards.
5454   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5455 
5456   movl(tmp, jint_cast(0.5));
5457   movq(xtmp1, tmp);
5458   vbroadcastss(xtmp1, xtmp1, vec_enc);
5459   vaddps(xtmp1, src , xtmp1, vec_enc);
5460   vcvtps2dq(dst, xtmp1, vec_enc);
5461   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5462 
5463   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5464 }
5465 
5466 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5467                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5468   switch (from_elem_bt) {
5469     case T_BYTE:
5470       switch (to_elem_bt) {
5471         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5472         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5473         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5474         default: ShouldNotReachHere();
5475       }
5476       break;
5477     case T_SHORT:
5478       switch (to_elem_bt) {
5479         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5480         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5481         default: ShouldNotReachHere();
5482       }
5483       break;
5484     case T_INT:
5485       assert(to_elem_bt == T_LONG, "");
5486       vpmovzxdq(dst, src, vlen_enc);
5487       break;
5488     default:
5489       ShouldNotReachHere();
5490   }
5491 }
5492 
5493 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5494                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5495   switch (from_elem_bt) {
5496     case T_BYTE:
5497       switch (to_elem_bt) {
5498         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5499         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5500         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5501         default: ShouldNotReachHere();
5502       }
5503       break;
5504     case T_SHORT:
5505       switch (to_elem_bt) {
5506         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5507         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5508         default: ShouldNotReachHere();
5509       }
5510       break;
5511     case T_INT:
5512       assert(to_elem_bt == T_LONG, "");
5513       vpmovsxdq(dst, src, vlen_enc);
5514       break;
5515     default:
5516       ShouldNotReachHere();
5517   }
5518 }
5519 
5520 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5521                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5522   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5523   assert(vlen_enc != AVX_512bit, "");
5524 
5525   int dst_bt_size = type2aelembytes(dst_bt);
5526   int src_bt_size = type2aelembytes(src_bt);
5527   if (dst_bt_size > src_bt_size) {
5528     switch (dst_bt_size / src_bt_size) {
5529       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5530       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5531       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5532       default: ShouldNotReachHere();
5533     }
5534   } else {
5535     assert(dst_bt_size < src_bt_size, "");
5536     switch (src_bt_size / dst_bt_size) {
5537       case 2: {
5538         if (vlen_enc == AVX_128bit) {
5539           vpacksswb(dst, src, src, vlen_enc);
5540         } else {
5541           vpacksswb(dst, src, src, vlen_enc);
5542           vpermq(dst, dst, 0x08, vlen_enc);
5543         }
5544         break;
5545       }
5546       case 4: {
5547         if (vlen_enc == AVX_128bit) {
5548           vpackssdw(dst, src, src, vlen_enc);
5549           vpacksswb(dst, dst, dst, vlen_enc);
5550         } else {
5551           vpackssdw(dst, src, src, vlen_enc);
5552           vpermq(dst, dst, 0x08, vlen_enc);
5553           vpacksswb(dst, dst, dst, AVX_128bit);
5554         }
5555         break;
5556       }
5557       case 8: {
5558         if (vlen_enc == AVX_128bit) {
5559           vpshufd(dst, src, 0x08, vlen_enc);
5560           vpackssdw(dst, dst, dst, vlen_enc);
5561           vpacksswb(dst, dst, dst, vlen_enc);
5562         } else {
5563           vpshufd(dst, src, 0x08, vlen_enc);
5564           vpermq(dst, dst, 0x08, vlen_enc);
5565           vpackssdw(dst, dst, dst, AVX_128bit);
5566           vpacksswb(dst, dst, dst, AVX_128bit);
5567         }
5568         break;
5569       }
5570       default: ShouldNotReachHere();
5571     }
5572   }
5573 }
5574 
5575 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5576                                    bool merge, BasicType bt, int vlen_enc) {
5577   if (bt == T_INT) {
5578     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5579   } else {
5580     assert(bt == T_LONG, "");
5581     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5582   }
5583 }
5584 
5585 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5586                                    bool merge, BasicType bt, int vlen_enc) {
5587   if (bt == T_INT) {
5588     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5589   } else {
5590     assert(bt == T_LONG, "");
5591     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5592   }
5593 }
5594 
5595 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5596                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5597                                                int vec_enc) {
5598   int index = 0;
5599   int vindex = 0;
5600   mov64(rtmp1, 0x0101010101010101L);
5601   pdepq(rtmp1, src, rtmp1);
5602   if (mask_len > 8) {
5603     movq(rtmp2, src);
5604     vpxor(xtmp, xtmp, xtmp, vec_enc);
5605     movq(xtmp, rtmp1);
5606   }
5607   movq(dst, rtmp1);
5608 
5609   mask_len -= 8;
5610   while (mask_len > 0) {
5611     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5612     index++;
5613     if ((index % 2) == 0) {
5614       pxor(xtmp, xtmp);
5615     }
5616     mov64(rtmp1, 0x0101010101010101L);
5617     shrq(rtmp2, 8);
5618     pdepq(rtmp1, rtmp2, rtmp1);
5619     pinsrq(xtmp, rtmp1, index % 2);
5620     vindex = index / 2;
5621     if (vindex) {
5622       // Write the entire 16 byte vector once both 64 bit
5623       // lanes are updated, to save redundant instructions.
5624       if (index % 2) {
5625         vinsertf128(dst, dst, xtmp, vindex);
5626       }
5627     } else {
5628       vmovdqu(dst, xtmp);
5629     }
5630     mask_len -= 8;
5631   }
5632 }
5633 
5634 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5635   switch(opc) {
5636     case Op_VectorMaskTrueCount:
5637       popcntq(dst, tmp);
5638       break;
5639     case Op_VectorMaskLastTrue:
5640       if (VM_Version::supports_lzcnt()) {
5641         lzcntq(tmp, tmp);
5642         movl(dst, 63);
5643         subl(dst, tmp);
5644       } else {
5645         movl(dst, -1);
5646         bsrq(tmp, tmp);
5647         cmov32(Assembler::notZero, dst, tmp);
5648       }
5649       break;
5650     case Op_VectorMaskFirstTrue:
5651       if (VM_Version::supports_bmi1()) {
5652         if (masklen < 32) {
5653           orl(tmp, 1 << masklen);
5654           tzcntl(dst, tmp);
5655         } else if (masklen == 32) {
5656           tzcntl(dst, tmp);
5657         } else {
5658           assert(masklen == 64, "");
5659           tzcntq(dst, tmp);
5660         }
5661       } else {
5662         if (masklen < 32) {
5663           orl(tmp, 1 << masklen);
5664           bsfl(dst, tmp);
5665         } else {
5666           assert(masklen == 32 || masklen == 64, "");
5667           movl(dst, masklen);
5668           if (masklen == 32)  {
5669             bsfl(tmp, tmp);
5670           } else {
5671             bsfq(tmp, tmp);
5672           }
5673           cmov32(Assembler::notZero, dst, tmp);
5674         }
5675       }
5676       break;
5677     case Op_VectorMaskToLong:
5678       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5679       break;
5680     default: assert(false, "Unhandled mask operation");
5681   }
5682 }
5683 
5684 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5685                                               int masklen, int masksize, int vec_enc) {
5686   assert(VM_Version::supports_popcnt(), "");
5687 
5688   if(VM_Version::supports_avx512bw()) {
5689     kmovql(tmp, mask);
5690   } else {
5691     assert(masklen <= 16, "");
5692     kmovwl(tmp, mask);
5693   }
5694 
5695   // A mask generated by partial vector comparison/replicate/mask manipulation
5696   // operations needs to be clipped.
5697   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5698     andq(tmp, (1 << masklen) - 1);
5699   }
5700 
5701   vector_mask_operation_helper(opc, dst, tmp, masklen);
5702 }
5703 
5704 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5705                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5706   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5707          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5708   assert(VM_Version::supports_popcnt(), "");
5709 
5710   bool need_clip = false;
5711   switch(bt) {
5712     case T_BOOLEAN:
5713       // While masks of other types contain 0 or -1 lanes, boolean masks contain lane values of 0 or 1.
5714       vpxor(xtmp, xtmp, xtmp, vec_enc);
5715       vpsubb(xtmp, xtmp, mask, vec_enc);
5716       vpmovmskb(tmp, xtmp, vec_enc);
5717       need_clip = masklen < 16;
5718       break;
5719     case T_BYTE:
5720       vpmovmskb(tmp, mask, vec_enc);
5721       need_clip = masklen < 16;
5722       break;
5723     case T_SHORT:
5724       vpacksswb(xtmp, mask, mask, vec_enc);
5725       if (masklen >= 16) {
5726         vpermpd(xtmp, xtmp, 8, vec_enc);
5727       }
5728       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5729       need_clip = masklen < 16;
5730       break;
5731     case T_INT:
5732     case T_FLOAT:
5733       vmovmskps(tmp, mask, vec_enc);
5734       need_clip = masklen < 4;
5735       break;
5736     case T_LONG:
5737     case T_DOUBLE:
5738       vmovmskpd(tmp, mask, vec_enc);
5739       need_clip = masklen < 2;
5740       break;
5741     default: assert(false, "Unhandled type, %s", type2name(bt));
5742   }
5743 
5744   // A mask generated by partial vector comparison/replicate/mask manipulation
5745   // operations needs to be clipped.
5746   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5747     // need_clip implies masklen < 32
5748     andq(tmp, (1 << masklen) - 1);
5749   }
5750 
5751   vector_mask_operation_helper(opc, dst, tmp, masklen);
5752 }
5753 
5754 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5755                                              Register rtmp2, int mask_len) {
5756   kmov(rtmp1, src);
5757   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5758   mov64(rtmp2, -1L);
5759   pextq(rtmp2, rtmp2, rtmp1);
5760   kmov(dst, rtmp2);
5761 }
5762 
5763 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5764                                                     XMMRegister mask, Register rtmp, Register rscratch,
5765                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5766                                                     int vec_enc) {
5767   assert(type2aelembytes(bt) >= 4, "");
5768   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5769   address compress_perm_table = nullptr;
5770   address expand_perm_table = nullptr;
5771   if (type2aelembytes(bt) == 8) {
5772     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5773     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5774     vmovmskpd(rtmp, mask, vec_enc);
5775   } else {
5776     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5777     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5778     vmovmskps(rtmp, mask, vec_enc);
5779   }
5780   shlq(rtmp, 5); // for 32 byte permute row.
5781   if (opcode == Op_CompressV) {
5782     lea(rscratch, ExternalAddress(compress_perm_table));
5783   } else {
5784     lea(rscratch, ExternalAddress(expand_perm_table));
5785   }
5786   addptr(rtmp, rscratch);
5787   vmovdqu(permv, Address(rtmp));
5788   vpermps(dst, permv, src, Assembler::AVX_256bit);
5789   vpxor(xtmp, xtmp, xtmp, vec_enc);
5790   // Blend the result with a zero vector using the permute mask: each column entry
5791   // in a permute table row contains either a valid permute index or -1 (the default)
5792   // value, so the row can also serve as a blending mask after
5793   // compressing/expanding the source vector lanes.
5794   vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
5795 }
5796 
5797 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5798                                                bool merge, BasicType bt, int vec_enc) {
5799   if (opcode == Op_CompressV) {
5800     switch(bt) {
5801     case T_BYTE:
5802       evpcompressb(dst, mask, src, merge, vec_enc);
5803       break;
5804     case T_CHAR:
5805     case T_SHORT:
5806       evpcompressw(dst, mask, src, merge, vec_enc);
5807       break;
5808     case T_INT:
5809       evpcompressd(dst, mask, src, merge, vec_enc);
5810       break;
5811     case T_FLOAT:
5812       evcompressps(dst, mask, src, merge, vec_enc);
5813       break;
5814     case T_LONG:
5815       evpcompressq(dst, mask, src, merge, vec_enc);
5816       break;
5817     case T_DOUBLE:
5818       evcompresspd(dst, mask, src, merge, vec_enc);
5819       break;
5820     default:
5821       fatal("Unsupported type %s", type2name(bt));
5822       break;
5823     }
5824   } else {
5825     assert(opcode == Op_ExpandV, "");
5826     switch(bt) {
5827     case T_BYTE:
5828       evpexpandb(dst, mask, src, merge, vec_enc);
5829       break;
5830     case T_CHAR:
5831     case T_SHORT:
5832       evpexpandw(dst, mask, src, merge, vec_enc);
5833       break;
5834     case T_INT:
5835       evpexpandd(dst, mask, src, merge, vec_enc);
5836       break;
5837     case T_FLOAT:
5838       evexpandps(dst, mask, src, merge, vec_enc);
5839       break;
5840     case T_LONG:
5841       evpexpandq(dst, mask, src, merge, vec_enc);
5842       break;
5843     case T_DOUBLE:
5844       evexpandpd(dst, mask, src, merge, vec_enc);
5845       break;
5846     default:
5847       fatal("Unsupported type %s", type2name(bt));
5848       break;
5849     }
5850   }
5851 }
5852 
5853 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5854                                            KRegister ktmp1, int vec_enc) {
5855   if (opcode == Op_SignumVD) {
5856     vsubpd(dst, zero, one, vec_enc);
5857     // dst = (src < 0) ? -1 : 1
5858     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5859     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5860     // If src is NaN, -0.0 or 0.0, return src.
5861     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5862     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5863   } else {
5864     assert(opcode == Op_SignumVF, "");
5865     vsubps(dst, zero, one, vec_enc);
5866     // dst = (src < 0) ? -1 : 1
5867     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5868     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5869     // If src is NaN, -0.0 or 0.0, return src.
5870     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5871     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5872   }
5873 }
5874 
5875 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5876                                           XMMRegister xtmp1, int vec_enc) {
5877   if (opcode == Op_SignumVD) {
5878     vsubpd(dst, zero, one, vec_enc);
5879     // dst = (src < 0) ? -1 : 1
5880     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
5881     // If src is NaN, -0.0 or 0.0, return src.
5882     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5883     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5884   } else {
5885     assert(opcode == Op_SignumVF, "");
5886     vsubps(dst, zero, one, vec_enc);
5887     // dst = (src < 0) ? -1 : 1
5888     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
5889     // If src is NaN, -0.0 or 0.0, return src.
5890     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5891     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5892   }
5893 }
5894 
5895 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5896   if (VM_Version::supports_avx512bw()) {
5897     if (mask_len > 32) {
5898       kmovql(dst, src);
5899     } else {
5900       kmovdl(dst, src);
5901       if (mask_len != 32) {
5902         kshiftrdl(dst, dst, 32 - mask_len);
5903       }
5904     }
5905   } else {
5906     assert(mask_len <= 16, "");
5907     kmovwl(dst, src);
5908     if (mask_len != 16) {
5909       kshiftrwl(dst, dst, 16 - mask_len);
5910     }
5911   }
5912 }
5913 
5914 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5915   int lane_size = type2aelembytes(bt);
5916   if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5917       (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5918     movptr(rtmp, imm32);
5919     switch(lane_size) {
5920       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5921       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5922       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5923       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5924       default : fatal("Unsupported lane size %d", lane_size); break;
5926     }
5927   } else {
5928     movptr(rtmp, imm32);
5929     movq(dst, rtmp);
5930     switch(lane_size) {
5931       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5932       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5933       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5934       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5935       default : fatal("Unsupported lane size %d", lane_size); break;
5937     }
5938   }
5939 }
5940 
5941 //
5942 // The following is a lookup-table based popcount computation algorithm:
5943 //       Index   Bit set count
5944 //     [ 0000 ->   0,
5945 //       0001 ->   1,
5946 //       0010 ->   1,
5947 //       0011 ->   2,
5948 //       0100 ->   1,
5949 //       0101 ->   2,
5950 //       0110 ->   2,
5951 //       0111 ->   3,
5952 //       1000 ->   1,
5953 //       1001 ->   2,
5954 //       1010 ->   3,
5955 //       1011 ->   3,
5956 //       1100 ->   2,
5957 //       1101 ->   3,
     //       1110 ->   3,
5958 //       1111 ->   4 ]
5959 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5960 //     shuffle indices for lookup table access.
5961 //  b. Right shift each byte of vector lane by 4 positions.
5962 //  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
5963 //     shuffle indices for lookup table access.
5964 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5965 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5966 //     count of all the bytes of a quadword.
5967 //  f. Perform step e. for upper 128bit vector lane.
5968 //  g. Pack the bitset count of quadwords back to double word.
5969 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
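     //
     //  Illustrative sketch (added for clarity; popcount_byte and lut are hypothetical and
     //  only mirror steps a.-d. for a single byte, they are not part of the emitted code):
     //    int popcount_byte(uint8_t b) {
     //      return lut[b & 0x0F] + lut[b >> 4];  // lut is the 16-entry table listed above
     //    }
     //  e.g. b = 0xB7 = 0b10110111: lut[0b0111] + lut[0b1011] = 3 + 3 = 6 == popcount(0xB7).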
5970 
5971 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5972                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5973   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5974   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5975   vpsrlw(dst, src, 4, vec_enc);
5976   vpand(dst, dst, xtmp1, vec_enc);
5977   vpand(xtmp1, src, xtmp1, vec_enc);
5978   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5979   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5980   vpshufb(dst, xtmp2, dst, vec_enc);
5981   vpaddb(dst, dst, xtmp1, vec_enc);
5982 }
5983 
5984 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5985                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5986   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5987   // Following code is as per steps e,f,g and h of above algorithm.
5988   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5989   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5990   vpsadbw(dst, dst, xtmp2, vec_enc);
5991   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5992   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5993   vpackuswb(dst, xtmp1, dst, vec_enc);
5994 }
5995 
5996 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5997                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5998   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5999   // Add the popcount of upper and lower bytes of word.
6000   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
6001   vpsrlw(dst, xtmp1, 8, vec_enc);
6002   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
6003   vpaddw(dst, dst, xtmp1, vec_enc);
6004 }
6005 
6006 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6007                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
6008   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
6009   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6010   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
6011 }
6012 
6013 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6014                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
6015   switch(bt) {
6016     case T_LONG:
6017       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
6018       break;
6019     case T_INT:
6020       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
6021       break;
6022     case T_CHAR:
6023     case T_SHORT:
6024       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
6025       break;
6026     case T_BYTE:
6027     case T_BOOLEAN:
6028       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
6029       break;
6030     default:
6031       fatal("Unsupported type %s", type2name(bt));
6032       break;
6033   }
6034 }
6035 
6036 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6037                                                       KRegister mask, bool merge, int vec_enc) {
6038   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6039   switch(bt) {
6040     case T_LONG:
6041       assert(VM_Version::supports_avx512_vpopcntdq(), "");
6042       evpopcntq(dst, mask, src, merge, vec_enc);
6043       break;
6044     case T_INT:
6045       assert(VM_Version::supports_avx512_vpopcntdq(), "");
6046       evpopcntd(dst, mask, src, merge, vec_enc);
6047       break;
6048     case T_CHAR:
6049     case T_SHORT:
6050       assert(VM_Version::supports_avx512_bitalg(), "");
6051       evpopcntw(dst, mask, src, merge, vec_enc);
6052       break;
6053     case T_BYTE:
6054     case T_BOOLEAN:
6055       assert(VM_Version::supports_avx512_bitalg(), "");
6056       evpopcntb(dst, mask, src, merge, vec_enc);
6057       break;
6058     default:
6059       fatal("Unsupported type %s", type2name(bt));
6060       break;
6061   }
6062 }
6063 
6064 // The bit reversal algorithm first reverses the bits of each byte and then performs
6065 // a byte level reversal for multi-byte primitive types (short/int/long).
6066 // The algorithm uses a lookup table to obtain the reversed bit sequence
6067 // corresponding to a 4 bit value; the reversed bit sequence of a byte is thus
6068 // obtained by swapping the reversed bit sequences of its upper and lower
6069 // nibbles.
6070 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6071                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
6072   if (VM_Version::supports_avx512vlbw()) {
6073 
6074     // Get the reverse bit sequence of lower nibble of each byte.
6075     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
6076     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6077     evpandq(dst, xtmp2, src, vec_enc);
6078     vpshufb(dst, xtmp1, dst, vec_enc);
6079     vpsllq(dst, dst, 4, vec_enc);
6080 
6081     // Get the reverse bit sequence of upper nibble of each byte.
6082     vpandn(xtmp2, xtmp2, src, vec_enc);
6083     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6084     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6085 
6086     // Perform a logical OR operation between the left shifted reverse bit sequence of the lower nibble and
6087     // the right shifted reverse bit sequence of the upper nibble to obtain the reverse bit sequence of each byte.
6088     evporq(xtmp2, dst, xtmp2, vec_enc);
6089     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6090 
6091   } else if(vec_enc == Assembler::AVX_512bit) {
6092     // Shift based bit reversal.
6093     assert(bt == T_LONG || bt == T_INT, "");
6094 
6095     // Swap lower and upper nibble of each byte.
6096     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
6097 
6098     // Swap two least and most significant bits of each nibble.
6099     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
6100 
6101     // Swap adjacent pair of bits.
6102     evmovdqul(xtmp1, k0, dst, true, vec_enc);
6103     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
6104 
6105     evmovdqul(xtmp1, k0, dst, true, vec_enc);
6106     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
6107   } else {
6108     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
6109     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6110 
6111     // Get the reverse bit sequence of lower nibble of each byte.
6112     vpand(dst, xtmp2, src, vec_enc);
6113     vpshufb(dst, xtmp1, dst, vec_enc);
6114     vpsllq(dst, dst, 4, vec_enc);
6115 
6116     // Get the reverse bit sequence of upper nibble of each byte.
6117     vpandn(xtmp2, xtmp2, src, vec_enc);
6118     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6119     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6120 
6121     // Perform a logical OR operation between the left shifted reverse bit sequence of the lower nibble and
6122     // the right shifted reverse bit sequence of the upper nibble to obtain the reverse bit sequence of each byte.
6123     vpor(xtmp2, dst, xtmp2, vec_enc);
6124     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6125   }
6126 }
6127 
6128 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
6129                                                 XMMRegister xtmp, Register rscratch) {
6130   assert(VM_Version::supports_gfni(), "");
6131   assert(rscratch != noreg || always_reachable(mask), "missing");
6132 
6133   // Galois field instruction based bit reversal, following the algorithm described at:
6134   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6135   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
6136   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
6137   vector_reverse_byte(bt, dst, xtmp, vec_enc);
6138 }
6139 
6140 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
6141                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
6142   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
6143   evpandq(dst, xtmp1, src, vec_enc);
6144   vpsllq(dst, dst, nbits, vec_enc);
6145   vpandn(xtmp1, xtmp1, src, vec_enc);
6146   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
6147   evporq(dst, dst, xtmp1, vec_enc);
6148 }
6149 
6150 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6151                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
6152   // Shift based bit reversal.
6153   assert(VM_Version::supports_evex(), "");
6154   switch(bt) {
6155     case T_LONG:
6156       // Swap upper and lower double word of each quad word.
6157       evprorq(xtmp1, k0, src, 32, true, vec_enc);
6158       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
6159       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6160       break;
6161     case T_INT:
6162       // Swap upper and lower word of each double word.
6163       evprord(xtmp1, k0, src, 16, true, vec_enc);
6164       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6165       break;
6166     case T_CHAR:
6167     case T_SHORT:
6168       // Swap upper and lower byte of each word.
6169       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
6170       break;
6171     case T_BYTE:
6172       evmovdquq(dst, k0, src, true, vec_enc);
6173       break;
6174     default:
6175       fatal("Unsupported type %s", type2name(bt));
6176       break;
6177   }
6178 }
6179 
6180 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
6181   if (bt == T_BYTE) {
6182     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
6183       evmovdquq(dst, k0, src, true, vec_enc);
6184     } else {
6185       vmovdqu(dst, src);
6186     }
6187     return;
6188   }
6189   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
6190   // pre-computed shuffle indices.
6191   switch(bt) {
6192     case T_LONG:
6193       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
6194       break;
6195     case T_INT:
6196       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
6197       break;
6198     case T_CHAR:
6199     case T_SHORT:
6200       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6201       break;
6202     default:
6203       fatal("Unsupported type %s", type2name(bt));
6204       break;
6205   }
6206   vpshufb(dst, src, dst, vec_enc);
6207 }
6208 
6209 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6210                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6211                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6212   assert(is_integral_type(bt), "");
6213   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6214   assert(VM_Version::supports_avx512cd(), "");
6215   switch(bt) {
6216     case T_LONG:
6217       evplzcntq(dst, ktmp, src, merge, vec_enc);
6218       break;
6219     case T_INT:
6220       evplzcntd(dst, ktmp, src, merge, vec_enc);
6221       break;
6222     case T_SHORT:
6223       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6224       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6225       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6226       vpunpckhwd(dst, xtmp1, src, vec_enc);
6227       evplzcntd(dst, ktmp, dst, merge, vec_enc);
6228       vpackusdw(dst, xtmp2, dst, vec_enc);
6229       break;
6230     case T_BYTE:
6231       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6232       // accessing the lookup table.
6233       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6234       // accessing the lookup table.
6235       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6236       assert(VM_Version::supports_avx512bw(), "");
6237       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6238       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6239       vpand(xtmp2, dst, src, vec_enc);
6240       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6241       vpsrlw(xtmp3, src, 4, vec_enc);
6242       vpand(xtmp3, dst, xtmp3, vec_enc);
6243       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6244       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6245       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6246       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6247       break;
6248     default:
6249       fatal("Unsupported type %s", type2name(bt));
6250       break;
6251   }
6252 }
6253 
6254 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6255                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6256   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6257   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6258   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6259   // accessing the lookup table.
6260   vpand(dst, xtmp2, src, vec_enc);
6261   vpshufb(dst, xtmp1, dst, vec_enc);
6262   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6263   // accessing the lookup table.
6264   vpsrlw(xtmp3, src, 4, vec_enc);
6265   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6266   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6267   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6268   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6269   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6270   vpaddb(dst, dst, xtmp2, vec_enc);
6271   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6272 }
6273 
6274 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6275                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6276   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6277   // Add zero counts of lower byte and upper byte of a word if
6278   // upper byte holds a zero value.
6279   vpsrlw(xtmp3, src, 8, vec_enc);
6280   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6281   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6282   vpsllw(xtmp2, dst, 8, vec_enc);
6283   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6284   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6285   vpsrlw(dst, dst, 8, vec_enc);
6286 }
6287 
6288 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6289                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
6290   // Since the IEEE 754 floating point format represents the mantissa in normalized 1.x form,
6291   // the biased exponent can be used to compute the leading zero count as per the
6292   // following formula:
6293   //   LZCNT = 31 - (biased_exp - 127)
6294   // Special handling has been introduced for zero, Max_Int and negative source values.
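       //
       // Illustrative scalar sketch (added for clarity; clz32 is a hypothetical helper,
       // not part of the emitted code, and assumes <cstring> for memcpy):
       //   int clz32(uint32_t x) {
       //     if (x == 0) return 32;
       //     float f = (float)(x & ~(x >> 1));   // clear the bit below the highest set bit
       //     uint32_t bits; memcpy(&bits, &f, sizeof(bits));
       //     return 31 - ((int)((bits >> 23) & 0xFF) - 127);
       //   }
       // e.g. x = 0x00001000 = 2^12 converts to 4096.0f with biased exponent 139,
       // so LZCNT = 31 - (139 - 127) = 19.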
6295 
6296   // Broadcast 0xFF
6297   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6298   vpsrld(xtmp1, xtmp1, 24, vec_enc);
6299 
6300   // Remove the bit to the right of the highest set bit, ensuring that the conversion to float cannot round up to a higher
6301   // power of 2, which would have a higher exponent than the input. This transformation is valid as only the highest set bit
6302   // contributes to the number of leading zeros.
6303   vpsrld(xtmp2, src, 1, vec_enc);
6304   vpandn(xtmp3, xtmp2, src, vec_enc);
6305 
6306   // Extract biased exponent.
6307   vcvtdq2ps(dst, xtmp3, vec_enc);
6308   vpsrld(dst, dst, 23, vec_enc);
6309   vpand(dst, dst, xtmp1, vec_enc);
6310 
6311   // Broadcast 127.
6312   vpsrld(xtmp1, xtmp1, 1, vec_enc);
6313   // Exponent = biased_exp - 127
6314   vpsubd(dst, dst, xtmp1, vec_enc);
6315 
6316   // Exponent_plus_one = Exponent + 1
6317   vpsrld(xtmp3, xtmp1, 6, vec_enc);
6318   vpaddd(dst, dst, xtmp3, vec_enc);
6319 
6320   // Replace -ve exponent with zero, exponent is -ve when src
6321   // lane contains a zero value.
6322   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6323   vblendvps(dst, dst, xtmp2, dst, vec_enc);
6324 
6325   // Rematerialize broadcast 32.
6326   vpslld(xtmp1, xtmp3, 5, vec_enc);
6327   // Exponent is 32 if corresponding source lane contains max_int value.
6328   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6329   // LZCNT = 32 - exponent_plus_one
6330   vpsubd(dst, xtmp1, dst, vec_enc);
6331 
6332   // Replace LZCNT with a value 1 if corresponding source lane
6333   // contains max_int value.
6334   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6335 
6336   // Replace biased_exp with 0 if source lane value is less than zero.
6337   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6338   vblendvps(dst, dst, xtmp2, src, vec_enc);
6339 }
6340 
6341 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6342                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6343   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6344   // Add zero counts of lower word and upper word of a double word if
6345   // upper word holds a zero value.
6346   vpsrld(xtmp3, src, 16, vec_enc);
6347   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6348   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6349   vpslld(xtmp2, dst, 16, vec_enc);
6350   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6351   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6352   vpsrld(dst, dst, 16, vec_enc);
6353   // Add zero counts of lower doubleword and upper doubleword of a
6354   // quadword if upper doubleword holds a zero value.
6355   vpsrlq(xtmp3, src, 32, vec_enc);
6356   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6357   vpsllq(xtmp2, dst, 32, vec_enc);
6358   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6359   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6360   vpsrlq(dst, dst, 32, vec_enc);
6361 }
6362 
6363 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6364                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6365                                                        Register rtmp, int vec_enc) {
6366   assert(is_integral_type(bt), "unexpected type");
6367   assert(vec_enc < Assembler::AVX_512bit, "");
6368   switch(bt) {
6369     case T_LONG:
6370       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6371       break;
6372     case T_INT:
6373       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6374       break;
6375     case T_SHORT:
6376       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6377       break;
6378     case T_BYTE:
6379       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6380       break;
6381     default:
6382       fatal("Unsupported type %s", type2name(bt));
6383       break;
6384   }
6385 }
6386 
6387 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6388   switch(bt) {
6389     case T_BYTE:
6390       vpsubb(dst, src1, src2, vec_enc);
6391       break;
6392     case T_SHORT:
6393       vpsubw(dst, src1, src2, vec_enc);
6394       break;
6395     case T_INT:
6396       vpsubd(dst, src1, src2, vec_enc);
6397       break;
6398     case T_LONG:
6399       vpsubq(dst, src1, src2, vec_enc);
6400       break;
6401     default:
6402       fatal("Unsupported type %s", type2name(bt));
6403       break;
6404   }
6405 }
6406 
6407 // Trailing zero count computation is based on the leading zero count operation as per the
6408 // following equation. All AVX3 targets support the AVX512CD feature, which offers a
6409 // direct vector instruction to compute the leading zero count.
6410 //      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
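     //
     // Illustrative worked example (added for clarity, 32 bit lane): for x = 0b101000
     // (= 40, three trailing zeros), x - 1 = 0b100111 and the low bits of ~x are 0b010111,
     // so (x - 1) & ~x = 0b000111. CLZ(0b000111) = 29 and CTZ = 32 - 29 = 3. For x = 0
     // the expression evaluates to all ones, CLZ = 0 and CTZ = 32, as expected.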
6411 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6412                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6413                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6414   assert(is_integral_type(bt), "");
6415   // xtmp = -1
6416   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6417   // xtmp = xtmp + src
6418   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6419   // xtmp = xtmp & ~src
6420   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6421   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6422   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6423   vpsub(bt, dst, xtmp4, dst, vec_enc);
6424 }
6425 
6426 // Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation
6427 //      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
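     //
     // Illustrative worked example (added for clarity, 32 bit lane): for x = 40 = 0x28,
     // -x = 0xFFFFFFD8 and x | -x = 0xFFFFFFF8, i.e. all bits from the lowest set bit of
     // x upwards. POPC(0xFFFFFFF8) = 29, so CTZ = 32 - 29 = 3. For x = 0, x | -x = 0,
     // POPC = 0 and CTZ = 32.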
6428 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6429                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6430   assert(is_integral_type(bt), "");
6431   // xtmp = 0
6432   vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
6433   // xtmp = 0 - src
6434   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6435   // xtmp = xtmp | src
6436   vpor(xtmp3, xtmp3, src, vec_enc);
6437   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6438   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6439   vpsub(bt, dst, xtmp1, dst, vec_enc);
6440 }
6441 
6442 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6443   Label done;
6444   Label neg_divisor_fastpath;
6445   cmpl(divisor, 0);
6446   jccb(Assembler::less, neg_divisor_fastpath);
6447   xorl(rdx, rdx);
6448   divl(divisor);
6449   jmpb(done);
6450   bind(neg_divisor_fastpath);
6451   // Fastpath for divisor < 0:
6452   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6453   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
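       // Illustrative worked example (added for clarity): with divisor = 0x80000000 the
       // unsigned quotient can only be 0 or 1. For dividend = 0xF0000000:
       // dividend - divisor = 0x70000000, ~0x70000000 = 0x8FFFFFFF,
       // 0xF0000000 & 0x8FFFFFFF = 0x80000000, and >>> 31 yields quotient 1.
       // For dividend = 0x70000000 the same steps yield 0x70000000 & 0x0FFFFFFF = 0, i.e. quotient 0.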
6454   movl(rdx, rax);
6455   subl(rdx, divisor);
6456   if (VM_Version::supports_bmi1()) {
6457     andnl(rax, rdx, rax);
6458   } else {
6459     notl(rdx);
6460     andl(rax, rdx);
6461   }
6462   shrl(rax, 31);
6463   bind(done);
6464 }
6465 
6466 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6467   Label done;
6468   Label neg_divisor_fastpath;
6469   cmpl(divisor, 0);
6470   jccb(Assembler::less, neg_divisor_fastpath);
6471   xorl(rdx, rdx);
6472   divl(divisor);
6473   jmpb(done);
6474   bind(neg_divisor_fastpath);
6475   // Fastpath when divisor < 0:
6476   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6477   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
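       // Illustrative worked example (added for clarity): for dividend = 0xF0000000 and
       // divisor = 0x80000000, dividend & ~(dividend - divisor) = 0x80000000, the arithmetic
       // shift >> 31 gives 0xFFFFFFFF, ANDing with the divisor gives 0x80000000, and
       // remainder = 0xF0000000 - 0x80000000 = 0x70000000, matching remainderUnsigned().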
6478   movl(rdx, rax);
6479   subl(rax, divisor);
6480   if (VM_Version::supports_bmi1()) {
6481     andnl(rax, rax, rdx);
6482   } else {
6483     notl(rax);
6484     andl(rax, rdx);
6485   }
6486   sarl(rax, 31);
6487   andl(rax, divisor);
6488   subl(rdx, rax);
6489   bind(done);
6490 }
6491 
6492 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6493   Label done;
6494   Label neg_divisor_fastpath;
6495 
6496   cmpl(divisor, 0);
6497   jccb(Assembler::less, neg_divisor_fastpath);
6498   xorl(rdx, rdx);
6499   divl(divisor);
6500   jmpb(done);
6501   bind(neg_divisor_fastpath);
6502   // Fastpath for divisor < 0:
6503   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6504   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6505   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6506   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6507   movl(rdx, rax);
6508   subl(rax, divisor);
6509   if (VM_Version::supports_bmi1()) {
6510     andnl(rax, rax, rdx);
6511   } else {
6512     notl(rax);
6513     andl(rax, rdx);
6514   }
6515   movl(tmp, rax);
6516   shrl(rax, 31); // quotient
6517   sarl(tmp, 31);
6518   andl(tmp, divisor);
6519   subl(rdx, tmp); // remainder
6520   bind(done);
6521 }
6522 
6523 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6524                                  XMMRegister xtmp2, Register rtmp) {
6525   if(VM_Version::supports_gfni()) {
6526     // Galois field instruction based bit reversal, following the algorithm described at:
6527     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6528     mov64(rtmp, 0x8040201008040201L);
6529     movq(xtmp1, src);
6530     movq(xtmp2, rtmp);
6531     gf2p8affineqb(xtmp1, xtmp2, 0);
6532     movq(dst, xtmp1);
6533   } else {
6534     // Swap even and odd numbered bits.
6535     movl(rtmp, src);
6536     andl(rtmp, 0x55555555);
6537     shll(rtmp, 1);
6538     movl(dst, src);
6539     andl(dst, 0xAAAAAAAA);
6540     shrl(dst, 1);
6541     orl(dst, rtmp);
6542 
6543     // Swap LSB and MSB 2 bits of each nibble.
6544     movl(rtmp, dst);
6545     andl(rtmp, 0x33333333);
6546     shll(rtmp, 2);
6547     andl(dst, 0xCCCCCCCC);
6548     shrl(dst, 2);
6549     orl(dst, rtmp);
6550 
6551     // Swap LSB and MSB 4 bits of each byte.
6552     movl(rtmp, dst);
6553     andl(rtmp, 0x0F0F0F0F);
6554     shll(rtmp, 4);
6555     andl(dst, 0xF0F0F0F0);
6556     shrl(dst, 4);
6557     orl(dst, rtmp);
6558   }
6559   bswapl(dst);
6560 }
6561 
6562 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6563                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
6564   if(VM_Version::supports_gfni()) {
6565     // Galois field instruction based bit reversal, following the algorithm described at:
6566     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6567     mov64(rtmp1, 0x8040201008040201L);
6568     movq(xtmp1, src);
6569     movq(xtmp2, rtmp1);
6570     gf2p8affineqb(xtmp1, xtmp2, 0);
6571     movq(dst, xtmp1);
6572   } else {
6573     // Swap even and odd numbered bits.
6574     movq(rtmp1, src);
6575     mov64(rtmp2, 0x5555555555555555L);
6576     andq(rtmp1, rtmp2);
6577     shlq(rtmp1, 1);
6578     movq(dst, src);
6579     notq(rtmp2);
6580     andq(dst, rtmp2);
6581     shrq(dst, 1);
6582     orq(dst, rtmp1);
6583 
6584     // Swap LSB and MSB 2 bits of each nibble.
6585     movq(rtmp1, dst);
6586     mov64(rtmp2, 0x3333333333333333L);
6587     andq(rtmp1, rtmp2);
6588     shlq(rtmp1, 2);
6589     notq(rtmp2);
6590     andq(dst, rtmp2);
6591     shrq(dst, 2);
6592     orq(dst, rtmp1);
6593 
6594     // Swap LSB and MSB 4 bits of each byte.
6595     movq(rtmp1, dst);
6596     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6597     andq(rtmp1, rtmp2);
6598     shlq(rtmp1, 4);
6599     notq(rtmp2);
6600     andq(dst, rtmp2);
6601     shrq(dst, 4);
6602     orq(dst, rtmp1);
6603   }
6604   bswapq(dst);
6605 }
6606 
6607 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6608   Label done;
6609   Label neg_divisor_fastpath;
6610   cmpq(divisor, 0);
6611   jccb(Assembler::less, neg_divisor_fastpath);
6612   xorl(rdx, rdx);
6613   divq(divisor);
6614   jmpb(done);
6615   bind(neg_divisor_fastpath);
6616   // Fastpath for divisor < 0:
6617   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6618   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6619   movq(rdx, rax);
6620   subq(rdx, divisor);
6621   if (VM_Version::supports_bmi1()) {
6622     andnq(rax, rdx, rax);
6623   } else {
6624     notq(rdx);
6625     andq(rax, rdx);
6626   }
6627   shrq(rax, 63);
6628   bind(done);
6629 }
6630 
6631 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6632   Label done;
6633   Label neg_divisor_fastpath;
6634   cmpq(divisor, 0);
6635   jccb(Assembler::less, neg_divisor_fastpath);
6636   xorq(rdx, rdx);
6637   divq(divisor);
6638   jmp(done);
6639   bind(neg_divisor_fastpath);
6640   // Fastpath when divisor < 0:
6641   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6642   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6643   movq(rdx, rax);
6644   subq(rax, divisor);
6645   if (VM_Version::supports_bmi1()) {
6646     andnq(rax, rax, rdx);
6647   } else {
6648     notq(rax);
6649     andq(rax, rdx);
6650   }
6651   sarq(rax, 63);
6652   andq(rax, divisor);
6653   subq(rdx, rax);
6654   bind(done);
6655 }
6656 
6657 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6658   Label done;
6659   Label neg_divisor_fastpath;
6660   cmpq(divisor, 0);
6661   jccb(Assembler::less, neg_divisor_fastpath);
6662   xorq(rdx, rdx);
6663   divq(divisor);
6664   jmp(done);
6665   bind(neg_divisor_fastpath);
6666   // Fastpath for divisor < 0:
6667   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6668   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6669   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6670   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6671   movq(rdx, rax);
6672   subq(rax, divisor);
6673   if (VM_Version::supports_bmi1()) {
6674     andnq(rax, rax, rdx);
6675   } else {
6676     notq(rax);
6677     andq(rax, rdx);
6678   }
6679   movq(tmp, rax);
6680   shrq(rax, 63); // quotient
6681   sarq(tmp, 63);
6682   andq(tmp, divisor);
6683   subq(rdx, tmp); // remainder
6684   bind(done);
6685 }
6686 
6687 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6688                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6689                                         int vlen_enc) {
6690   assert(VM_Version::supports_avx512bw(), "");
6691   // Byte shuffles are in-lane operations and the indices are determined using the
6692   // lower 4 bits of each shuffle lane, thus all shuffle indices are effectively
6693   // normalized to the index range 0-15. This ensures that indices which differ by a
6694   // multiple of 16 select the same relative position within a 128 bit
6695   // lane, i.e. shuffle indices 16, 32 and 48 all select the first element of
6696   // their respective 128 bit lanes.
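       // For instance (added for clarity): a shuffle index of 37 = 2 * 16 + 5 has lower
       // 4 bits equal to 5, so once the third 128 bit source lane has been broadcast
       // (the 0xAA shuffle below), the in-lane byte shuffle picks byte 5 of that lane,
       // which is exactly source byte 37.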
6697   movl(rtmp, 16);
6698   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6699 
6700   // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
6701   // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
6702   // original shuffle indices and move the shuffled lanes corresponding to true
6703   // mask to destination vector.
6704   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6705   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6706   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6707 
6708   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6709   // and broadcasting second 128 bit lane.
6710   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6711   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6712   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6713   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6714   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6715 
6716   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6717   // and broadcasting third 128 bit lane.
6718   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6719   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6720   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6721   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6722   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6723 
6724   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
6725   // and broadcasting the fourth 128 bit lane.
6726   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6727   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6728   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6729   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6730   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6731 }
6732 
6733 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6734                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6735   if (vlen_enc == AVX_128bit) {
6736     vpermilps(dst, src, shuffle, vlen_enc);
6737   } else if (bt == T_INT) {
6738     vpermd(dst, shuffle, src, vlen_enc);
6739   } else {
6740     assert(bt == T_FLOAT, "");
6741     vpermps(dst, shuffle, src, vlen_enc);
6742   }
6743 }
6744 
6745 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6746   switch(opcode) {
6747     case Op_AddHF: vaddsh(dst, src1, src2); break;
6748     case Op_SubHF: vsubsh(dst, src1, src2); break;
6749     case Op_MulHF: vmulsh(dst, src1, src2); break;
6750     case Op_DivHF: vdivsh(dst, src1, src2); break;
6751     default: assert(false, "%s", NodeClassNames[opcode]); break;
6752   }
6753 }
6754 
6755 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6756   switch(elem_bt) {
6757     case T_BYTE:
6758       if (ideal_opc == Op_SaturatingAddV) {
6759         vpaddsb(dst, src1, src2, vlen_enc);
6760       } else {
6761         assert(ideal_opc == Op_SaturatingSubV, "");
6762         vpsubsb(dst, src1, src2, vlen_enc);
6763       }
6764       break;
6765     case T_SHORT:
6766       if (ideal_opc == Op_SaturatingAddV) {
6767         vpaddsw(dst, src1, src2, vlen_enc);
6768       } else {
6769         assert(ideal_opc == Op_SaturatingSubV, "");
6770         vpsubsw(dst, src1, src2, vlen_enc);
6771       }
6772       break;
6773     default:
6774       fatal("Unsupported type %s", type2name(elem_bt));
6775       break;
6776   }
6777 }
6778 
6779 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6780   switch(elem_bt) {
6781     case T_BYTE:
6782       if (ideal_opc == Op_SaturatingAddV) {
6783         vpaddusb(dst, src1, src2, vlen_enc);
6784       } else {
6785         assert(ideal_opc == Op_SaturatingSubV, "");
6786         vpsubusb(dst, src1, src2, vlen_enc);
6787       }
6788       break;
6789     case T_SHORT:
6790       if (ideal_opc == Op_SaturatingAddV) {
6791         vpaddusw(dst, src1, src2, vlen_enc);
6792       } else {
6793         assert(ideal_opc == Op_SaturatingSubV, "");
6794         vpsubusw(dst, src1, src2, vlen_enc);
6795       }
6796       break;
6797     default:
6798       fatal("Unsupported type %s", type2name(elem_bt));
6799       break;
6800   }
6801 }
6802 
6803 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6804                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
6805   // For unsigned subtraction, overflow (wrap-around below zero) happens when the second input is greater than the first input.
6806   // overflow_mask = Inp1 <u Inp2
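       // Illustrative example (added for clarity, per 32 bit lane): src1 = 3, src2 = 5 sets the
       // mask lane (3 <u 5) and the saturated result is 0; src1 = 5, src2 = 3 leaves the mask
       // lane clear and the result is the plain difference 2.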
6807   evpcmpu(elem_bt, ktmp,  src2, src1, Assembler::lt, vlen_enc);
6808   // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6809   evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6810 }
6811 
6812 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6813                                                               XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6814   // Emulate unsigned comparison using signed comparison
6815   // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE <s Inp2 + MIN_VALUE
6816   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6817   vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6818   vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6819 
6820   vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6821 
6822   // Res = INP1 - INP2 (non-commutative and non-associative)
6823   vpsub(elem_bt, dst, src1, src2, vlen_enc);
6824   // Res = Mask ? Zero : Res
6825   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6826   vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6827 }
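
// Illustrative (non-compiled) scalar sketch of the signed-comparison emulation used above for a
// T_INT lane: biasing both operands by MIN_VALUE (0x80000000) turns an unsigned comparison into
// a signed one. The helper name is for exposition only.
//
//   static bool unsigned_lt_via_signed(uint32_t a, uint32_t b) {
//     int32_t as = (int32_t)(a + 0x80000000u);  // bias into the signed domain
//     int32_t bs = (int32_t)(b + 0x80000000u);
//     return as < bs;                           // equals (a <u b)
//   }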
6828 
6829 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6830                                                                XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
6831   // The unsigned value range comprises only non-negative numbers, thus only an upper-bound saturation exists.
6832   // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6833   // Res = Signed Add INP1, INP2
6834   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6835   // T1 = SRC1 | SRC2
6836   vpor(xtmp1, src1, src2, vlen_enc);
6837   // Max_Unsigned = -1
6838   vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6839   // Unsigned compare:  Mask = Res <u T1
6840   evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6841   // res  = Mask ? Max_Unsigned : Res
6842   evpblend(elem_bt, dst, ktmp,  dst, xtmp2, true, vlen_enc);
6843 }
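
// Illustrative (non-compiled) scalar sketch of the sequence above for a T_INT lane; the helper
// name is for exposition only.
//
//   static uint32_t saturating_add_u32(uint32_t a, uint32_t b) {
//     uint32_t res = a + b;                 // wrapping add
//     return (res < (a | b)) ? 0xFFFFFFFFu  // overflow -> saturate to max unsigned
//                            : res;
//   }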
6844 
6845 //
6846 // Section 2-13 of Hacker's Delight lists the following overflow detection check for the
6847 // saturating unsigned addition operation.
6848 //    overflow_mask = ((a & b) | ((a | b) & ~(a + b))) >>> 31 == 1
6849 //
6850 // We empirically determined its semantic equivalence to the following reduced expression
6851 //    overflow_mask = (a + b) <u (a | b)
6852 //
6853 // and also verified it through the Alive2 solver.
6854 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6855 //
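// For example, with 32-bit inputs a = 0xFFFFFFF0 and b = 0x20: a + b wraps to 0x10 while
// a | b = 0xFFFFFFF0, so (a + b) <u (a | b) holds; the original expression flags the same
// overflow, since ((a | b) & ~(a + b)) has bit 31 set.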
6856 
6857 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6858                                                               XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6859   // Res = Signed Add INP1, INP2
6860   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6861   // Compute T1 = INP1 | INP2
6862   vpor(xtmp3, src1, src2, vlen_enc);
6863   // xtmp2 = Minimum signed value (MIN_VALUE); xtmp1 is left holding -1 in all lanes.
6864   vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6865   // Bias T1 into the signed domain, T1 = T1 + MIN_VALUE
6866   vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
6867   // Bias Res into the signed domain, Res<s> = Res + MIN_VALUE
6868   vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
6869   // Compute overflow detection mask = Res<s> <s T1
6870   if (elem_bt == T_INT) {
6871     vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
6872   } else {
6873     assert(elem_bt == T_LONG, "");
6874     vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
6875   }
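  // Res = overflow_mask (xtmp3) ? Max_Unsigned (-1, held in xtmp1) : Res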
6876   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6877 }
6878 
6879 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6880                                       int vlen_enc, bool xtmp2_hold_M1) {
6881   if (VM_Version::supports_avx512dq()) {
6882     evpmovq2m(ktmp, src, vlen_enc);
6883   } else {
6884     assert(VM_Version::supports_evex(), "");
6885     if (!xtmp2_hold_M1) {
6886       vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6887     }
6888     evpsraq(xtmp1, src, 63, vlen_enc);
6889     evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6890   }
6891 }
6892 
6893 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6894                                       int vlen_enc, bool xtmp2_hold_M1) {
6895   if (VM_Version::supports_avx512dq()) {
6896     evpmovd2m(ktmp, src, vlen_enc);
6897   } else {
6898     assert(VM_Version::supports_evex(), "");
6899     if (!xtmp2_hold_M1) {
6900       vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6901     }
6902     vpsrad(xtmp1, src, 31, vlen_enc);
6903     Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6904   }
6905 }
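
// Illustrative (non-compiled) scalar sketch of the emulation used above when AVX512DQ is not
// available, shown for one 32-bit lane: an arithmetic right shift by 31 yields -1 for negative
// inputs and 0 otherwise, so comparing against an all-ones lane reproduces the sign-bit mask
// that evpmovd2m/evpmovq2m would produce directly. The helper name is for exposition only.
//
//   static bool msb_set_s32(int32_t v) {
//     return (v >> 31) == -1;   // matches vpsrad(31) followed by a compare against -1
//   }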
6906 
6907 
6908 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6909   if (elem_bt == T_LONG) {
6910     if (VM_Version::supports_evex()) {
6911       evpsraq(dst, src, 63, vlen_enc);
6912     } else {
6913       vpsrad(dst, src, 31, vlen_enc);
6914       vpshufd(dst, dst, 0xF5, vlen_enc);
6915     }
6916   } else {
6917     assert(elem_bt == T_INT, "");
6918     vpsrad(dst, src, 31, vlen_enc);
6919   }
6920 }
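
// Note on the non-EVEX T_LONG path above: vpsrad shifts 32-bit lanes only, so after the shift
// each qword holds { sign of its low dword, sign of the full long }. vpshufd with imm8 0xF5
// (0b11'11'01'01) then replicates the odd (upper) dword of every qword into both dword
// positions, yielding 0 or -1 across the whole 64-bit lane.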
6921 
6922 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6923   if (compute_allones) {
6924     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6925       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6926     } else {
6927       vpcmpeqq(allones, allones, allones, vlen_enc);
6928     }
6929   }
6930   if (elem_bt == T_LONG) {
6931     vpsrlq(dst, allones, 1, vlen_enc);
6932   } else {
6933     assert(elem_bt == T_INT, "");
6934     vpsrld(dst, allones, 1, vlen_enc);
6935   }
6936 }
6937 
6938 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6939   if (compute_allones) {
6940     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6941       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6942     } else {
6943       vpcmpeqq(allones, allones, allones, vlen_enc);
6944     }
6945   }
6946   if (elem_bt == T_LONG) {
6947     vpsllq(dst, allones, 63, vlen_enc);
6948   } else {
6949     assert(elem_bt == T_INT, "");
6950     vpslld(dst, allones, 31, vlen_enc);
6951   }
6952 }
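
// Illustrative derivation of the constants produced by the two helpers above, shown for T_INT
// lanes (the T_LONG case uses shift counts of 1 and 63 on 64-bit lanes):
//
//   all-ones        = 0xFFFFFFFF  (-1)
//   all-ones >>u 1  = 0x7FFFFFFF  (MAX_VALUE, vpgenmax_value)
//   all-ones <<  31 = 0x80000000  (MIN_VALUE, vpgenmin_value)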
6953 
6954 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
6955                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6956   switch(elem_bt) {
6957     case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6958     case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6959     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6960     case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6961     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6962   }
6963 }
6964 
6965 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6966   switch(elem_bt) {
6967     case  T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
6968     case  T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
6969     case  T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6970     case  T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
6971     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6972   }
6973 }
6974 
6975 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6976                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6977   if (elem_bt == T_LONG) {
6978     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6979   } else {
6980     assert(elem_bt == T_INT, "");
6981     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6982   }
6983 }
6984 
6985 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6986                                                          XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6987                                                          KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6988   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6989   // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6990   // Overflow detection is based on Hacker's Delight, section 2-13.
6991   if (ideal_opc == Op_SaturatingAddV) {
6992     // res = src1 + src2
6993     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6994     // Overflow occurs if both inputs have the same polarity and the result polarity differs from it.
6995     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6996     vpxor(xtmp1, dst, src1, vlen_enc);
6997     vpxor(xtmp2, dst, src2, vlen_enc);
6998     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6999   } else {
7000     assert(ideal_opc == Op_SaturatingSubV, "");
7001     // res = src1 - src2
7002     vpsub(elem_bt, dst, src1, src2, vlen_enc);
7003     // Overflow occurs when the inputs have opposite polarity and the
7004     // result polarity does not match the first input's polarity.
7005     // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
7006     vpxor(xtmp1, src1, src2, vlen_enc);
7007     vpxor(xtmp2, dst, src1, vlen_enc);
7008     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
7009   }
7010 
7011   // Compute overflow detection mask.
7012   evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
7013   // Note: xtmp1 holds -1 in all its lanes after the above call.
7014 
7015   // Compute mask based on first input polarity.
7016   evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
7017 
7018   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
7019   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
7020 
7021   // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
7022   // set bits in the first input polarity mask hold the MIN value.
7023   evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
7024   // Blend destination lanes with saturated values using overflow detection mask.
7025   evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
7026 }
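
// Illustrative (non-compiled) scalar sketch of the saturating signed addition case above for a
// T_INT lane; subtraction differs only in the overflow predicate, while the saturation value is
// chosen from the first input's sign in the same way. The helper name is for exposition only.
//
//   static int32_t saturating_add_s32(int32_t a, int32_t b) {
//     int32_t res = (int32_t)((uint32_t)a + (uint32_t)b);  // wrapping add
//     bool overflow = ((res ^ a) & (res ^ b)) < 0;         // sign bit of the detection mask
//     if (!overflow) return res;
//     return (a < 0) ? INT32_MIN : INT32_MAX;              // saturate toward the inputs' sign
//   }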
7027 
7028 
7029 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
7030                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
7031                                                         XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
7032   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
7033   // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
7034   // Overflow detection is based on Hacker's Delight, section 2-13.
7035   if (ideal_opc == Op_SaturatingAddV) {
7036     // res = src1 + src2
7037     vpadd(elem_bt, dst, src1, src2, vlen_enc);
7038     // Overflow occurs if both inputs have the same polarity and the result polarity differs from it.
7039     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
7040     vpxor(xtmp1, dst, src1, vlen_enc);
7041     vpxor(xtmp2, dst, src2, vlen_enc);
7042     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
7043   } else {
7044     assert(ideal_opc == Op_SaturatingSubV, "");
7045     // res = src1 - src2
7046     vpsub(elem_bt, dst, src1, src2, vlen_enc);
7047     // Overflow occurs when the inputs have opposite polarity and the
7048     // result polarity does not match the first input's polarity.
7049     // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
7050     vpxor(xtmp1, src1, src2, vlen_enc);
7051     vpxor(xtmp2, dst, src1, vlen_enc);
7052     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
7053   }
7054 
7055   // Sign-extend to compute overflow detection mask.
7056   vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
7057 
7058   vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
7059   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
7060   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
7061 
7062   // Compose saturating min/max vector using first input polarity mask.
7063   vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
7064   vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
7065 
7066   // Blend result with saturating vector using overflow detection mask.
7067   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
7068 }
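
// Note: vpblendvb selects per byte using the most significant bit of each mask byte. The masks
// produced above (xtmp3, xtmp4) are sign-extended to 0 or -1 per 32/64-bit lane, so every byte
// of a lane carries the same selector bit and the byte-wise blend behaves like a lane-wise
// blend.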
7069 
7070 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7071   switch(elem_bt) {
7072     case T_BYTE:
7073       if (ideal_opc == Op_SaturatingAddV) {
7074         vpaddsb(dst, src1, src2, vlen_enc);
7075       } else {
7076         assert(ideal_opc == Op_SaturatingSubV, "");
7077         vpsubsb(dst, src1, src2, vlen_enc);
7078       }
7079       break;
7080     case T_SHORT:
7081       if (ideal_opc == Op_SaturatingAddV) {
7082         vpaddsw(dst, src1, src2, vlen_enc);
7083       } else {
7084         assert(ideal_opc == Op_SaturatingSubV, "");
7085         vpsubsw(dst, src1, src2, vlen_enc);
7086       }
7087       break;
7088     default:
7089       fatal("Unsupported type %s", type2name(elem_bt));
7090       break;
7091   }
7092 }
7093 
7094 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7095   switch(elem_bt) {
7096     case T_BYTE:
7097       if (ideal_opc == Op_SaturatingAddV) {
7098         vpaddusb(dst, src1, src2, vlen_enc);
7099       } else {
7100         assert(ideal_opc == Op_SaturatingSubV, "");
7101         vpsubusb(dst, src1, src2, vlen_enc);
7102       }
7103       break;
7104     case T_SHORT:
7105       if (ideal_opc == Op_SaturatingAddV) {
7106         vpaddusw(dst, src1, src2, vlen_enc);
7107       } else {
7108         assert(ideal_opc == Op_SaturatingSubV, "");
7109         vpsubusw(dst, src1, src2, vlen_enc);
7110       }
7111       break;
7112     default:
7113       fatal("Unsupported type %s", type2name(elem_bt));
7114       break;
7115   }
7116 }
7117 
7118 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
7119                                                      XMMRegister src2, int vlen_enc) {
7120   switch(elem_bt) {
7121     case T_BYTE:
7122       evpermi2b(dst, src1, src2, vlen_enc);
7123       break;
7124     case T_SHORT:
7125       evpermi2w(dst, src1, src2, vlen_enc);
7126       break;
7127     case T_INT:
7128       evpermi2d(dst, src1, src2, vlen_enc);
7129       break;
7130     case T_LONG:
7131       evpermi2q(dst, src1, src2, vlen_enc);
7132       break;
7133     case T_FLOAT:
7134       evpermi2ps(dst, src1, src2, vlen_enc);
7135       break;
7136     case T_DOUBLE:
7137       evpermi2pd(dst, src1, src2, vlen_enc);
7138       break;
7139     default:
7140       fatal("Unsupported type %s", type2name(elem_bt));
7141       break;
7142   }
7143 }
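
// Illustrative (non-compiled) scalar sketch of the two-table permute performed above: dst
// initially holds the selector indices, and each index picks a lane from the concatenation
// {src1, src2}. n denotes the number of lanes; names are for exposition only.
//
//   for (int i = 0; i < n; i++) {
//     int sel = dst[i] & (2 * n - 1);                  // low bits of the index lane
//     dst[i] = (sel < n) ? src1[sel] : src2[sel - n];  // select across both source tables
//   }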
7144 
7145 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
7146   if (is_unsigned) {
7147     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7148   } else {
7149     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7150   }
7151 }
7152 
7153 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
7154   if (is_unsigned) {
7155     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7156   } else {
7157     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7158   }
7159 }
7160 
7161 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
7162   switch(opcode) {
7163     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7164     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7165     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7166     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7167     default: assert(false, "%s", NodeClassNames[opcode]); break;
7168   }
7169 }
7170 
7171 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7172   switch(opcode) {
7173     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7174     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7175     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7176     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7177     default: assert(false, "%s", NodeClassNames[opcode]); break;
7178   }
7179 }
7180 
7181 void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7182                                             KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
7183   vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
7184 }
7185 
7186 void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7187                                             KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
7188   if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
7189     // Move sign bits of src2 to mask register.
7190     evpmovw2m(ktmp, src2, vlen_enc);
7191     // xtmp1 = src2 < 0 ? src2 : src1
7192     evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7193     // xtmp2 = src2 < 0 ? src1 : src2
7194     evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
7195     // The idea behind the above swapping is that a non-negative input, if any, ends up as the second source operand.
7196     // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
7197     // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
7198     // the second source operand, either a NaN or a valid floating-point value, is returned.
7199     // dst = max(xtmp1, xtmp2)
7200     evmaxph(dst, xtmp1, xtmp2, vlen_enc);
7201     // isNaN = is_unordered_quiet(xtmp1)
7202     evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
7203     // The final result is the same as the first source if it is a NaN;
7204     // if the second operand holds a NaN value then, as per the above semantics,
7205     // the result is the same as the second operand.
7206     Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7207   } else {
7208     assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
7209     // Move sign bits of src1 to mask register.
7210     evpmovw2m(ktmp, src1, vlen_enc);
7211     // xtmp1 = src1 < 0 ? src2 : src1
7212     evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7213     // xtmp2 = src1 < 0 ? src1 : src2
7214     evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
7215     // The idea behind the above swapping is that a negative input, if any, ends up as the second source operand.
7216     // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
7217     // the second source operand is returned.
7218     // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
7219     // or a valid floating-point value, is written to the result.
7220     // dst = min(xtmp1, xtmp2)
7221     evminph(dst, xtmp1, xtmp2, vlen_enc);
7222     // isNaN = is_unordered_quiet(xtmp1)
7223     evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
7224     // The final result is the same as the first source if it is a NaN;
7225     // if the second operand holds a NaN value then, as per the above semantics,
7226     // the result is the same as the second operand.
7227     Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7228   }
7229 }
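
// Illustrative (non-compiled) scalar sketch of the max semantics implemented above, written with
// float for brevity (min is symmetric, preferring -0.0 over +0.0). The helper name is for
// exposition only; the operand swap plus the final NaN blend produce the same result as:
//
//   static float fp16_max_semantics(float a, float b) {
//     if (a != a) return a;             // a NaN in either input propagates to the result
//     if (b != b) return b;
//     if (a == 0.0f && b == 0.0f) {
//       return std::signbit(a) ? b : a; // +0.0 is treated as greater than -0.0
//     }
//     return (a > b) ? a : b;
//   }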