/*
 * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/globals.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/objectMonitorTable.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/synchronizer.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#include "utilities/sizes.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
// Beware! This sp_inc is NOT the same as the one mentioned in MacroAssembler::remove_frame, but only the size
// of the extension space + the additional copy of the return address. That means it doesn't contain the
// frame size (where the locals and sp_inc are) or the saved RBP.
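//
// Roughly, the frame laid out below looks like this (stack grows down):
//
//   caller sp ->  [ ...                  ]
//                 [ return address       ]
//                 [ saved rbp            ]  <- rbp (if PreserveFramePointer)
//                 [ frame body           ]  remaining framesize bytes
//         rsp ->  [                      ]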
void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  if (C->clinit_barrier_on_entry()) {
    assert(VM_Version::supports_fast_class_init_checks(), "sanity");
    assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");

    Label L_skip_barrier;
    Register klass = rscratch1;

    mov_metadata(klass, C->method()->holder()->constant_encoding());
    clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);

    jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    bind(L_skip_barrier);
  }

  int framesize = C->output()->frame_size_in_bytes();
  int bangsize = C->output()->bang_size_in_bytes();
  bool fp_mode_24b = false;
  int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;

  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
    push(rbp);
#ifdef ASSERT
    if (sp_inc > 0) {
      movl(Address(rsp, 0), badRegWordVal);
      movl(Address(rsp, VMRegImpl::stack_slot_size), badRegWordVal);
    }
#endif
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    subptr(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
#ifdef ASSERT
    if (sp_inc > 0) {
      movl(Address(rsp, framesize), badRegWordVal);
      movl(Address(rsp, framesize + VMRegImpl::stack_slot_size), badRegWordVal);
    }
#endif
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (C->needs_stack_repair()) {
    // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
    assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
    movptr(Address(rsp, framesize - wordSize), sp_inc + framesize);
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif
}

void C2_MacroAssembler::entry_barrier() {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
  Label dummy_slow_path;
  Label dummy_continuation;
  Label* slow_path = &dummy_slow_path;
  Label* continuation = &dummy_continuation;
  if (!Compile::current()->output()->in_scratch_emit_size()) {
    // Use real labels from actual stub when not emitting code for the purpose of measuring its size
    C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
    Compile::current()->output()->add_stub(stub);
    slow_path = &stub->entry();
    continuation = &stub->continuation();
  }
  bs->nmethod_entry_barrier(this, slow_path, continuation);
}

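// Map a vector length in bytes to the AVX encoding expected by the assembler.
// Note that 4- and 8-byte vectors still use the full 128-bit (XMM) encoding.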
inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the lock-stack fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods.  That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) issue explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty,
//    avoiding the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path.  If the fast path fails then we pass
//    control to the slow path, typically in C.  In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock.  In the case of success, control
//    will drop through the node.  ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel.

// obj: object to lock
// box: on-stack box address -- KILLED
// rax: tmp -- KILLED
// t  : tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register rax_reg,
                                  Register t, Register thread) {
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Fast Lock

    Label push;

    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
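    // For the CAS below, rax holds the expected unlocked (0b01) mark and
    // t holds the new locked (0b00) mark; on failure rax is reloaded with
    // the current mark and ZF is cleared, which routes us to slow_path.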
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmp(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      const Register hash = t;
      Label monitor_found;

      // Look for the monitor in the om_cache.

      ByteSize cache_offset   = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled  = OMCache::CAPACITY;
      for (int i = 0; i < num_unrolled; i++) {
        movptr(monitor, Address(thread,  cache_offset + monitor_offset));
        cmpptr(obj, Address(thread, cache_offset));
        jccb(Assembler::equal, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      // Look for the monitor in the table.

      // Get the hash code.
      movptr(hash, Address(obj, oopDesc::mark_offset_in_bytes()));
      shrq(hash, markWord::hash_shift);
      andq(hash, markWord::hash_mask);

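      // Note: indexing with hash & capacity_mask below presumes the table
      // capacity is a power of two and that buckets is a flat array of
      // ObjectMonitor* entries.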
      // Get the table and calculate the bucket's address.
      lea(rax_reg, ExternalAddress(ObjectMonitorTable::current_table_address()));
      movptr(rax_reg, Address(rax_reg));
      andq(hash, Address(rax_reg, ObjectMonitorTable::table_capacity_mask_offset()));
      movptr(rax_reg, Address(rax_reg, ObjectMonitorTable::table_buckets_offset()));

      // Read the monitor from the bucket.
      movptr(monitor, Address(rax_reg, hash, Address::times_ptr));

      // Check if the monitor in the bucket is special (empty, tombstone or removed)
      cmpptr(monitor, ObjectMonitorTable::SpecialPointerValues::below_is_special);
      jcc(Assembler::below, slow_path);

      // Check if object matches.
      movptr(rax_reg, Address(monitor, ObjectMonitor::object_offset()));
      BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
      bs_asm->try_resolve_weak_handle_in_c2(this, rax_reg, slow_path);
      cmpptr(rax_reg, obj);
      jcc(Assembler::notEqual, slow_path);

      bind(monitor_found);
    }
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock before trashing box. On failure to acquire
      // the lock, the slow path will reset the entry accordingly (see CacheSetter).
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
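    // rax = 0 is the expected "no owner" value for the CAS; box is KILLED
    // anyway, so it is reused here to carry the new owner id.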
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
    lock(); cmpxchgptr(box, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive.
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

// obj: object to unlock
// rax: tmp -- KILLED
// t  : tmp - cannot be obj nor rax -- KILLED
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired in the current activation (frame).  Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI.  The "Java Native Interface
// Specification" states that an object locked by JNI's MonitorEnter should not be
// unlocked by "normal" java-level locking and vice-versa.  The specification doesn't
// specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register t, Register thread) {
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully.  MUST jump with ZF == 1
  Label unlocked, slow_path;

  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    stub = new (Compile::current()->comp_arena()) C2FastUnlockStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Fast Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check and let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
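    // Mirror image of the locking CAS: rax holds the expected locked (0b00)
    // mark, t the new unlocked (0b01) mark. A mark that was inflated in the
    // meantime simply fails the CAS and takes the stub path.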
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask_in_place);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }

  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jcc(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jcc(Assembler::notZero, recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    cmpptr(entry_list_address, NULL_WORD);
    jcc(Assembler::zero, unlocked);    // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jcc(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
  }

  bind(unlocked);
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that the slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Unlock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}

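// Recompute the frame pointer from rsp. The saved rbp lives at
// rsp + framesize - 2 * wordSize: one word for the saved rbp itself and
// one for the return address above it.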
static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
  const int framesize = Compile::current()->output()->frame_size_in_bytes();
  masm->movptr(dst, rsp);
  if (framesize > 2 * wordSize) {
    masm->addptr(dst, framesize - 2 * wordSize);
  }
}

void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
  if (PreserveFramePointer) {
    // frame pointer is valid
#ifdef ASSERT
    // Verify frame pointer value in rbp.
    reconstruct_frame_pointer_helper(this, rtmp);
    Label L_success;
    cmpq(rbp, rtmp);
    jccb(Assembler::equal, L_success);
    STOP("frame pointer mismatch");
    bind(L_success);
#endif // ASSERT
  } else {
    reconstruct_frame_pointer_helper(this, rbp);
  }
}

void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
  jint lo = t->_lo;
  jint hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
  if (t == TypeInt::INT) {
    return;
  }

  BLOCK_COMMENT("CastII {");
  Label fail;
  Label succeed;

  if (lo != min_jint) {
    cmpl(val, lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jint) {
    cmpl(val, hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  movl(c_rarg0, idx);
  movl(c_rarg1, val);
  movl(c_rarg2, lo);
  movl(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastII");
}

static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}

void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
  jlong lo = t->_lo;
  jlong hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
  if (t == TypeLong::LONG) {
    return;
  }

  BLOCK_COMMENT("CastLL {");
  Label fail;
  Label succeed;

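  // cmpq can only encode a 32-bit sign-extended immediate, so bounds outside
  // the simm32 range must be materialized in tmp first.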
  auto cmp_val = [&](jlong bound) {
    if (is_simm32(bound)) {
      cmpq(val, checked_cast<int>(bound));
    } else {
      mov64(tmp, bound);
      cmpq(val, tmp);
    }
  };

  if (lo != min_jlong) {
    cmp_val(lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jlong) {
    cmp_val(hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  movl(c_rarg0, idx);
  movq(c_rarg1, val);
  mov64(c_rarg2, lo);
  mov64(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastLL");
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

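  // SSE has no pminsq/pmaxsq. For T_LONG we emulate them with a signed
  // compare that leaves a mask in xmm0, followed by blendvpd, which
  // implicitly uses xmm0 as its selector.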
  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src);  // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src);  // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
                                  XMMRegister src1, Address src2, int vlen_enc) {
  assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
  if (opcode == Op_UMinV) {
    switch(elem_bt) {
      case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  } else {
    assert(opcode == Op_UMaxV, "required");
    switch(elem_bt) {
      case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  }
}

void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // For optimality, leverage a full vector width of 512 bits
  // for operations over smaller vector sizes on AVX512 targets.
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
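    // No unsigned 64-bit min/max below AVX-512. Adding 2^63 (i.e. flipping
    // the sign bit) turns an unsigned comparison into a signed one, so bias
    // both inputs, compare signed, and blend on the resulting mask.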
    // T1 = -1
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
    // Mask = T2 > T1
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}

void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
                                  XMMRegister src1, XMMRegister src2, int vlen_enc) {
  assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
  if (opcode == Op_UMinV) {
    switch(elem_bt) {
      case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  } else {
    assert(opcode == Op_UMaxV, "required");
    switch(elem_bt) {
      case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   *  The following pseudo code describes the algorithm for max[FD] (the Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX > 1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;
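  // evpmovd2m/evpmovq2m extract the per-element sign bits into a mask
  // register, which then drives the same blend-based -0.0/+0.0 and NaN
  // handling as the AVX path above, with the mask kept in ktmp instead
  // of a vector register.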

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

void C2_MacroAssembler::vminmax_fp_avx10_2(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
                                           XMMRegister src1, XMMRegister src2, int vlen_enc) {
  assert(opc == Op_MinV || opc == Op_MinReductionV ||
         opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");

  int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
                                                         : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
  if (elem_bt == T_FLOAT) {
    evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
  } else {
    assert(elem_bt == T_DOUBLE, "");
    evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
  }
}

void C2_MacroAssembler::sminmax_fp_avx10_2(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
                                           XMMRegister src1, XMMRegister src2) {
  assert(opc == Op_MinF || opc == Op_MaxF ||
         opc == Op_MinD || opc == Op_MaxD, "sanity");

  int imm8 = (opc == Op_MinF || opc == Op_MinD) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
                                                : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
  if (elem_bt == T_FLOAT) {
    evminmaxss(dst, mask, src1, src2, true, imm8);
  } else {
    assert(elem_bt == T_DOUBLE, "");
    evminmaxsd(dst, mask, src1, src2, true, imm8);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  // Handle the special cases +0.0/-0.0 and NaN: if the argument is +0.0/-0.0 or NaN, return the argument.
  // If AVX10.2 (or newer) floating point comparison instructions are used, SF=1 for equal and unordered cases.
  // If other floating point comparison instructions are used, ZF=1 for equal and unordered cases.
  if (opcode == Op_SignumF) {
    if (VM_Version::supports_avx10_2()) {
      evucomxss(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomiss(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    if (VM_Version::supports_avx10_2()) {
      evucomxsd(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomisd(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift);   break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift);  break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1304   } else if (opcode == Op_LShiftVL) {
1305     psllq(dst, shift);
1306   } else {
1307     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1308     psrlq(dst, shift);
1309   }
1310 }
1311 
1312 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1313   switch (opcode) {
1314     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1315     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1316     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1317 
1318     default: assert(false, "%s", NodeClassNames[opcode]);
1319   }
1320 }
1321 
1322 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1323   if (opcode == Op_RShiftVL) {
1324     evpsraq(dst, nds, shift, vector_len);
1325   } else if (opcode == Op_LShiftVL) {
1326     vpsllq(dst, nds, shift, vector_len);
1327   } else {
1328     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1329     vpsrlq(dst, nds, shift, vector_len);
1330   }
1331 }
1332 
1333 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1334   switch (opcode) {
1335     case Op_RShiftVB:  // fall-through
1336     case Op_RShiftVS:  // fall-through
1337     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1338 
1339     case Op_LShiftVB:  // fall-through
1340     case Op_LShiftVS:  // fall-through
1341     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1342 
1343     case Op_URShiftVB: // fall-through
1344     case Op_URShiftVS: // fall-through
1345     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1346 
1347     default: assert(false, "%s", NodeClassNames[opcode]);
1348   }
1349 }
1350 
1351 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1352   switch (opcode) {
1353     case Op_RShiftVB:  // fall-through
1354     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1355 
1356     case Op_LShiftVB:  // fall-through
1357     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1358 
1359     case Op_URShiftVB: // fall-through
1360     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1361 
1362     default: assert(false, "%s", NodeClassNames[opcode]);
1363   }
1364 }
1365 
1366 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1367   assert(UseAVX >= 2, "required");
1368   switch (opcode) {
1369     case Op_RShiftVL: {
1370       if (UseAVX > 2) {
1371         assert(tmp == xnoreg, "not used");
1372         if (!VM_Version::supports_avx512vl()) {
1373           vlen_enc = Assembler::AVX_512bit;
1374         }
1375         evpsravq(dst, src, shift, vlen_enc);
1376       } else {
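        // There is no packed 64-bit arithmetic right shift below AVX-512, so
        // emulate it with logical shifts and the per-lane sign-bit constant m
        // from vector_long_sign_mask():
        //   sra(x, n) == (srl(x, n) ^ srl(m, n)) - srl(m, n)
        // i.e. shift logically, then sign-extend the (64 - n)-bit result by
        // flip-and-subtract.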
1377         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1378         vpsrlvq(dst, src, shift, vlen_enc);
1379         vpsrlvq(tmp, tmp, shift, vlen_enc);
1380         vpxor(dst, dst, tmp, vlen_enc);
1381         vpsubq(dst, dst, tmp, vlen_enc);
1382       }
1383       break;
1384     }
1385     case Op_LShiftVL: {
1386       assert(tmp == xnoreg, "not used");
1387       vpsllvq(dst, src, shift, vlen_enc);
1388       break;
1389     }
1390     case Op_URShiftVL: {
1391       assert(tmp == xnoreg, "not used");
1392       vpsrlvq(dst, src, shift, vlen_enc);
1393       break;
1394     }
1395     default: assert(false, "%s", NodeClassNames[opcode]);
1396   }
1397 }
1398 
// Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1400 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1401   assert(opcode == Op_LShiftVB ||
1402          opcode == Op_RShiftVB ||
1403          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1404   bool sign = (opcode != Op_URShiftVB);
1405   assert(vector_len == 0, "required");
1406   vextendbd(sign, dst, src, 1);
1407   vpmovzxbd(vtmp, shift, 1);
1408   varshiftd(opcode, dst, dst, vtmp, 1);
1409   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1410   vextracti128_high(vtmp, dst);
1411   vpackusdw(dst, dst, vtmp, 0);
1412 }
1413 
// Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1415 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1416   assert(opcode == Op_LShiftVB ||
1417          opcode == Op_RShiftVB ||
1418          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1419   bool sign = (opcode != Op_URShiftVB);
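  // Widening bytes to words doubles the vector size, so step the AVX length
  // encoding up by one (128 -> 256 or 256 -> 512; the encodings are consecutive).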
1420   int ext_vector_len = vector_len + 1;
1421   vextendbw(sign, dst, src, ext_vector_len);
1422   vpmovzxbw(vtmp, shift, ext_vector_len);
1423   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1424   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1425   if (vector_len == 0) {
1426     vextracti128_high(vtmp, dst);
1427     vpackuswb(dst, dst, vtmp, vector_len);
1428   } else {
1429     vextracti64x4_high(vtmp, dst);
1430     vpackuswb(dst, dst, vtmp, vector_len);
1431     vpermq(dst, dst, 0xD8, vector_len);
1432   }
1433 }
1434 
1435 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1436   switch(typ) {
1437     case T_BYTE:
1438       pinsrb(dst, val, idx);
1439       break;
1440     case T_SHORT:
1441       pinsrw(dst, val, idx);
1442       break;
1443     case T_INT:
1444       pinsrd(dst, val, idx);
1445       break;
1446     case T_LONG:
1447       pinsrq(dst, val, idx);
1448       break;
1449     default:
1450       assert(false,"Should not reach here.");
1451       break;
1452   }
1453 }
1454 
1455 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1456   switch(typ) {
1457     case T_BYTE:
1458       vpinsrb(dst, src, val, idx);
1459       break;
1460     case T_SHORT:
1461       vpinsrw(dst, src, val, idx);
1462       break;
1463     case T_INT:
1464       vpinsrd(dst, src, val, idx);
1465       break;
1466     case T_LONG:
1467       vpinsrq(dst, src, val, idx);
1468       break;
1469     default:
1470       assert(false,"Should not reach here.");
1471       break;
1472   }
1473 }
1474 
1475 void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
1476                                          Register base, Register idx_base,
1477                                          Register mask, Register mask_idx,
1478                                          Register rtmp, int vlen_enc) {
1479   vpxor(dst, dst, dst, vlen_enc);
1480   if (elem_bt == T_SHORT) {
1481     for (int i = 0; i < 4; i++) {
1482       // dst[i] = mask[i] ? src[idx_base[i]] : 0
1483       Label skip_load;
1484       btq(mask, mask_idx);
1485       jccb(Assembler::carryClear, skip_load);
1486       movl(rtmp, Address(idx_base, i * 4));
1487       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1488       bind(skip_load);
1489       incq(mask_idx);
1490     }
1491   } else {
1492     assert(elem_bt == T_BYTE, "");
1493     for (int i = 0; i < 8; i++) {
1494       // dst[i] = mask[i] ? src[idx_base[i]] : 0
1495       Label skip_load;
1496       btq(mask, mask_idx);
1497       jccb(Assembler::carryClear, skip_load);
1498       movl(rtmp, Address(idx_base, i * 4));
1499       pinsrb(dst, Address(base, rtmp), i);
1500       bind(skip_load);
1501       incq(mask_idx);
1502     }
1503   }
1504 }
1505 
1506 void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
1507                                   Register base, Register idx_base,
1508                                   Register rtmp, int vlen_enc) {
1509   vpxor(dst, dst, dst, vlen_enc);
1510   if (elem_bt == T_SHORT) {
1511     for (int i = 0; i < 4; i++) {
1512       // dst[i] = src[idx_base[i]]
1513       movl(rtmp, Address(idx_base, i * 4));
1514       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1515     }
1516   } else {
1517     assert(elem_bt == T_BYTE, "");
1518     for (int i = 0; i < 8; i++) {
1519       // dst[i] = src[idx_base[i]]
1520       movl(rtmp, Address(idx_base, i * 4));
1521       pinsrb(dst, Address(base, rtmp), i);
1522     }
1523   }
1524 }
1525 
/*
 * Gather using a hybrid algorithm: first partially unroll a scalar loop
 * to accumulate values from the gather indices into a quad-word (64-bit)
 * slice. A slice may hold 8 byte or 4 short values. This is followed by a
 * vector permutation to place the slice into the appropriate vector lane
 * locations in the destination vector. The following pseudo code describes
 * the algorithm in detail:
 *
 * DST_VEC = ZERO_VEC
 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
 * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
 * FOREACH_ITER:
 *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
 *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
 *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
 *     PERM_INDEX = PERM_INDEX - TWO_VEC
 *
 * With each iteration, the doubleword permute indices (0, 1) corresponding
 * to the gathered quadword get shifted right by two lane positions.
 */
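// A concrete trace of the loop below (256-bit byte gather, vector_len == 32):
// the loop runs four times, gathering 8 bytes per iteration. On iteration k
// the permute indices have been decremented by two k times, so the permute
// drops that iteration's quadword into doubleword lanes {2k, 2k+1}, and the
// OR accumulates the four slices into dst.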
1547 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1548                                         Register base, Register idx_base,
1549                                         Register mask, XMMRegister xtmp1,
1550                                         XMMRegister xtmp2, XMMRegister temp_dst,
1551                                         Register rtmp, Register mask_idx,
1552                                         Register length, int vector_len, int vlen_enc) {
1553   Label GATHER8_LOOP;
1554   assert(is_subword_type(elem_ty), "");
1555   movl(length, vector_len);
1556   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1557   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1558   vallones(xtmp2, vlen_enc);
1559   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1560   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1561   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1562 
1563   bind(GATHER8_LOOP);
1564     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1565     if (mask == noreg) {
1566       vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
1567     } else {
1568       vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
1569     }
1570     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1571     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1572     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1573     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1574     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1575     vpor(dst, dst, temp_dst, vlen_enc);
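    // Advance past the int indices consumed this round (8 for bytes, 4 for
    // shorts, 4 bytes each) and decrement the remaining element count.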
1576     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
1577     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1578     jcc(Assembler::notEqual, GATHER8_LOOP);
1579 }
1580 
1581 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1582   switch(typ) {
1583     case T_INT:
1584       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1585       break;
1586     case T_FLOAT:
1587       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1588       break;
1589     case T_LONG:
1590       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1591       break;
1592     case T_DOUBLE:
1593       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1594       break;
1595     default:
1596       assert(false,"Should not reach here.");
1597       break;
1598   }
1599 }
1600 
1601 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1602   switch(typ) {
1603     case T_INT:
1604       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1605       break;
1606     case T_FLOAT:
1607       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1608       break;
1609     case T_LONG:
1610       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1611       break;
1612     case T_DOUBLE:
1613       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1614       break;
1615     default:
1616       assert(false,"Should not reach here.");
1617       break;
1618   }
1619 }
1620 
1621 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1622   switch(typ) {
1623     case T_INT:
1624       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1625       break;
1626     case T_FLOAT:
1627       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1628       break;
1629     case T_LONG:
1630       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1631       break;
1632     case T_DOUBLE:
1633       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1634       break;
1635     default:
1636       assert(false,"Should not reach here.");
1637       break;
1638   }
1639 }
1640 
1641 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
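  // src is expected to hold boolean bytes (0 or 1); computing 0 - src turns
  // each byte into 0x00 or 0xFF, which is then sign-extended to the element
  // width to form the usual all-zeros/all-ones vector mask.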
1642   if (vlen_in_bytes <= 16) {
1643     pxor (dst, dst);
1644     psubb(dst, src);
1645     switch (elem_bt) {
1646       case T_BYTE:   /* nothing to do */ break;
1647       case T_SHORT:  pmovsxbw(dst, dst); break;
1648       case T_INT:    pmovsxbd(dst, dst); break;
1649       case T_FLOAT:  pmovsxbd(dst, dst); break;
1650       case T_LONG:   pmovsxbq(dst, dst); break;
1651       case T_DOUBLE: pmovsxbq(dst, dst); break;
1652 
1653       default: assert(false, "%s", type2name(elem_bt));
1654     }
1655   } else {
1656     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1657     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1658 
1659     vpxor (dst, dst, dst, vlen_enc);
1660     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1661 
1662     switch (elem_bt) {
1663       case T_BYTE:   /* nothing to do */            break;
1664       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1665       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1666       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1667       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1668       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1669 
1670       default: assert(false, "%s", type2name(elem_bt));
1671     }
1672   }
1673 }
1674 
1675 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1676   if (novlbwdq) {
1677     vpmovsxbd(xtmp, src, vlen_enc);
1678     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1679             Assembler::eq, true, vlen_enc, noreg);
1680   } else {
1681     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1682     vpsubb(xtmp, xtmp, src, vlen_enc);
1683     evpmovb2m(dst, xtmp, vlen_enc);
1684   }
1685 }
1686 
1687 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1688   if (is_integral_type(bt)) {
1689     switch (vlen_in_bytes) {
1690       case 4:  movdl(dst, src);   break;
1691       case 8:  movq(dst, src);    break;
1692       case 16: movdqu(dst, src);  break;
1693       case 32: vmovdqu(dst, src); break;
1694       case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1695       default: ShouldNotReachHere();
1696     }
1697   } else {
1698     switch (vlen_in_bytes) {
1699       case 4:  movflt(dst, src); break;
1700       case 8:  movdbl(dst, src); break;
1701       case 16: movups(dst, src); break;
1702       case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1703       case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1704       default: ShouldNotReachHere();
1705     }
1706   }
1707 }
1708 
1709 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1710   assert(rscratch != noreg || always_reachable(src), "missing");
1711 
1712   if (reachable(src)) {
1713     load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1714   } else {
1715     lea(rscratch, src);
1716     load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1717   }
1718 }
1719 
1720 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1721   int vlen_enc = vector_length_encoding(vlen);
1722   if (VM_Version::supports_avx()) {
1723     if (bt == T_LONG) {
1724       if (VM_Version::supports_avx2()) {
1725         vpbroadcastq(dst, src, vlen_enc);
1726       } else {
1727         vmovddup(dst, src, vlen_enc);
1728       }
1729     } else if (bt == T_DOUBLE) {
1730       if (vlen_enc != Assembler::AVX_128bit) {
1731         vbroadcastsd(dst, src, vlen_enc, noreg);
1732       } else {
1733         vmovddup(dst, src, vlen_enc);
1734       }
1735     } else {
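      // vpbroadcastd requires AVX2; on AVX1 fall back to vbroadcastss, which
      // replicates the same 32-bit pattern and so works for integral
      // constants as well.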
1736       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1737         vpbroadcastd(dst, src, vlen_enc);
1738       } else {
1739         vbroadcastss(dst, src, vlen_enc);
1740       }
1741     }
1742   } else if (VM_Version::supports_sse3()) {
1743     movddup(dst, src);
1744   } else {
1745     load_vector(bt, dst, src, vlen);
1746   }
1747 }
1748 
1749 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1750   int entry_idx = vector_iota_entry_index(bt);
1751   ExternalAddress addr(StubRoutines::x86::vector_iota_indices(entry_idx));
1752   load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1753 }
1754 
1755 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1756 
1757 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1758   int vector_len = Assembler::AVX_128bit;
1759 
1760   switch (opcode) {
1761     case Op_AndReductionV:  pand(dst, src); break;
1762     case Op_OrReductionV:   por (dst, src); break;
1763     case Op_XorReductionV:  pxor(dst, src); break;
1764     case Op_MinReductionV:
1765       switch (typ) {
1766         case T_BYTE:        pminsb(dst, src); break;
1767         case T_SHORT:       pminsw(dst, src); break;
1768         case T_INT:         pminsd(dst, src); break;
1769         case T_LONG:        assert(UseAVX > 2, "required");
1770                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1771         default:            assert(false, "wrong type");
1772       }
1773       break;
1774     case Op_MaxReductionV:
1775       switch (typ) {
1776         case T_BYTE:        pmaxsb(dst, src); break;
1777         case T_SHORT:       pmaxsw(dst, src); break;
1778         case T_INT:         pmaxsd(dst, src); break;
1779         case T_LONG:        assert(UseAVX > 2, "required");
1780                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1781         default:            assert(false, "wrong type");
1782       }
1783       break;
1784     case Op_UMinReductionV:
1785       switch (typ) {
1786         case T_BYTE:        vpminub(dst, dst, src, Assembler::AVX_128bit); break;
1787         case T_SHORT:       vpminuw(dst, dst, src, Assembler::AVX_128bit); break;
1788         case T_INT:         vpminud(dst, dst, src, Assembler::AVX_128bit); break;
1789         case T_LONG:        evpminuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
1790         default:            assert(false, "wrong type");
1791       }
1792       break;
1793     case Op_UMaxReductionV:
1794       switch (typ) {
1795         case T_BYTE:        vpmaxub(dst, dst, src, Assembler::AVX_128bit); break;
1796         case T_SHORT:       vpmaxuw(dst, dst, src, Assembler::AVX_128bit); break;
1797         case T_INT:         vpmaxud(dst, dst, src, Assembler::AVX_128bit); break;
1798         case T_LONG:        evpmaxuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
1799         default:            assert(false, "wrong type");
1800       }
1801       break;
1802     case Op_AddReductionVF: addss(dst, src); break;
1803     case Op_AddReductionVD: addsd(dst, src); break;
1804     case Op_AddReductionVI:
1805       switch (typ) {
1806         case T_BYTE:        paddb(dst, src); break;
1807         case T_SHORT:       paddw(dst, src); break;
1808         case T_INT:         paddd(dst, src); break;
1809         default:            assert(false, "wrong type");
1810       }
1811       break;
1812     case Op_AddReductionVL: paddq(dst, src); break;
1813     case Op_MulReductionVF: mulss(dst, src); break;
1814     case Op_MulReductionVD: mulsd(dst, src); break;
1815     case Op_MulReductionVI:
1816       switch (typ) {
1817         case T_SHORT:       pmullw(dst, src); break;
1818         case T_INT:         pmulld(dst, src); break;
1819         default:            assert(false, "wrong type");
1820       }
1821       break;
1822     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1823                             evpmullq(dst, dst, src, vector_len); break;
1824     default:                assert(false, "wrong opcode");
1825   }
1826 }
1827 
1828 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1829   switch (opcode) {
1830     case Op_AddReductionVF: addps(dst, src); break;
1831     case Op_AddReductionVD: addpd(dst, src); break;
1832     case Op_MulReductionVF: mulps(dst, src); break;
1833     case Op_MulReductionVD: mulpd(dst, src); break;
1834     default:                assert(false, "%s", NodeClassNames[opcode]);
1835   }
1836 }
1837 
1838 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1839   int vector_len = Assembler::AVX_256bit;
1840 
1841   switch (opcode) {
1842     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1843     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1844     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1845     case Op_MinReductionV:
1846       switch (typ) {
1847         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1848         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1849         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1850         case T_LONG:        assert(UseAVX > 2, "required");
1851                             vpminsq(dst, src1, src2, vector_len); break;
1852         default:            assert(false, "wrong type");
1853       }
1854       break;
1855     case Op_MaxReductionV:
1856       switch (typ) {
1857         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1858         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1859         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1860         case T_LONG:        assert(UseAVX > 2, "required");
1861                             vpmaxsq(dst, src1, src2, vector_len); break;
1862         default:            assert(false, "wrong type");
1863       }
1864       break;
1865     case Op_UMinReductionV:
1866       switch (typ) {
1867         case T_BYTE:        vpminub(dst, src1, src2, vector_len); break;
1868         case T_SHORT:       vpminuw(dst, src1, src2, vector_len); break;
1869         case T_INT:         vpminud(dst, src1, src2, vector_len); break;
1870         case T_LONG:        evpminuq(dst, k0, src1, src2, true, vector_len); break;
1871         default:            assert(false, "wrong type");
1872       }
1873       break;
1874     case Op_UMaxReductionV:
1875       switch (typ) {
1876         case T_BYTE:        vpmaxub(dst, src1, src2, vector_len); break;
1877         case T_SHORT:       vpmaxuw(dst, src1, src2, vector_len); break;
1878         case T_INT:         vpmaxud(dst, src1, src2, vector_len); break;
1879         case T_LONG:        evpmaxuq(dst, k0, src1, src2, true, vector_len); break;
1880         default:            assert(false, "wrong type");
1881       }
1882       break;
1883     case Op_AddReductionVI:
1884       switch (typ) {
1885         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1886         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1887         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1888         default:            assert(false, "wrong type");
1889       }
1890       break;
1891     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1892     case Op_MulReductionVI:
1893       switch (typ) {
1894         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1895         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1896         default:            assert(false, "wrong type");
1897       }
1898       break;
1899     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1900     default:                assert(false, "wrong opcode");
1901   }
1902 }
1903 
1904 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1905   int vector_len = Assembler::AVX_256bit;
1906 
1907   switch (opcode) {
1908     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1909     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1910     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1911     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1912     default:                assert(false, "%s", NodeClassNames[opcode]);
1913   }
1914 }
1915 
1916 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1917                                   XMMRegister dst, XMMRegister src,
1918                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1919   switch (opcode) {
1920     case Op_AddReductionVF:
1921     case Op_MulReductionVF:
1922       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1923       break;
1924 
1925     case Op_AddReductionVD:
1926     case Op_MulReductionVD:
1927       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1928       break;
1929 
1930     default: assert(false, "wrong opcode");
1931   }
1932 }
1933 
1934 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1935                                             XMMRegister dst, XMMRegister src,
1936                                             XMMRegister vtmp1, XMMRegister vtmp2) {
1937   switch (opcode) {
1938     case Op_AddReductionVF:
1939     case Op_MulReductionVF:
1940       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1941       break;
1942 
1943     case Op_AddReductionVD:
1944     case Op_MulReductionVD:
1945       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1946       break;
1947 
1948     default: assert(false, "%s", NodeClassNames[opcode]);
1949   }
1950 }
1951 
1952 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1953                              Register dst, Register src1, XMMRegister src2,
1954                              XMMRegister vtmp1, XMMRegister vtmp2) {
1955   switch (vlen) {
1956     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1957     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1958     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1959     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1960 
1961     default: assert(false, "wrong vector length");
1962   }
1963 }
1964 
1965 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1966                              Register dst, Register src1, XMMRegister src2,
1967                              XMMRegister vtmp1, XMMRegister vtmp2) {
1968   switch (vlen) {
1969     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1970     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1971     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1972     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1973 
1974     default: assert(false, "wrong vector length");
1975   }
1976 }
1977 
1978 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1979                              Register dst, Register src1, XMMRegister src2,
1980                              XMMRegister vtmp1, XMMRegister vtmp2) {
1981   switch (vlen) {
1982     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1983     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1984     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1985     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1986 
1987     default: assert(false, "wrong vector length");
1988   }
1989 }
1990 
1991 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1992                              Register dst, Register src1, XMMRegister src2,
1993                              XMMRegister vtmp1, XMMRegister vtmp2) {
1994   switch (vlen) {
1995     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1996     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1997     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1998     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1999 
2000     default: assert(false, "wrong vector length");
2001   }
2002 }
2003 
2004 void C2_MacroAssembler::reduceL(int opcode, int vlen,
2005                              Register dst, Register src1, XMMRegister src2,
2006                              XMMRegister vtmp1, XMMRegister vtmp2) {
2007   switch (vlen) {
2008     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2009     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2010     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2011 
2012     default: assert(false, "wrong vector length");
2013   }
2014 }
2015 
2016 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2017   switch (vlen) {
2018     case 2:
2019       assert(vtmp2 == xnoreg, "");
2020       reduce2F(opcode, dst, src, vtmp1);
2021       break;
2022     case 4:
2023       assert(vtmp2 == xnoreg, "");
2024       reduce4F(opcode, dst, src, vtmp1);
2025       break;
2026     case 8:
2027       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2028       break;
2029     case 16:
2030       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2031       break;
2032     default: assert(false, "wrong vector length");
2033   }
2034 }
2035 
2036 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2037   switch (vlen) {
2038     case 2:
2039       assert(vtmp2 == xnoreg, "");
2040       reduce2D(opcode, dst, src, vtmp1);
2041       break;
2042     case 4:
2043       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2044       break;
2045     case 8:
2046       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2047       break;
2048     default: assert(false, "wrong vector length");
2049   }
2050 }
2051 
2052 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2053   switch (vlen) {
2054     case 2:
2055       assert(vtmp1 == xnoreg, "");
2056       assert(vtmp2 == xnoreg, "");
2057       unorderedReduce2F(opcode, dst, src);
2058       break;
2059     case 4:
2060       assert(vtmp2 == xnoreg, "");
2061       unorderedReduce4F(opcode, dst, src, vtmp1);
2062       break;
2063     case 8:
2064       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2065       break;
2066     case 16:
2067       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2068       break;
2069     default: assert(false, "wrong vector length");
2070   }
2071 }
2072 
2073 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2074   switch (vlen) {
2075     case 2:
2076       assert(vtmp1 == xnoreg, "");
2077       assert(vtmp2 == xnoreg, "");
2078       unorderedReduce2D(opcode, dst, src);
2079       break;
2080     case 4:
2081       assert(vtmp2 == xnoreg, "");
2082       unorderedReduce4D(opcode, dst, src, vtmp1);
2083       break;
2084     case 8:
2085       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2086       break;
2087     default: assert(false, "wrong vector length");
2088   }
2089 }
2090 
2091 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
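  // Reduce the two int lanes of src2 into one, then fold the incoming scalar
  // src1 into the result before extracting it.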
2092   if (opcode == Op_AddReductionVI) {
2093     if (vtmp1 != src2) {
2094       movdqu(vtmp1, src2);
2095     }
2096     phaddd(vtmp1, vtmp1);
2097   } else {
2098     pshufd(vtmp1, src2, 0x1);
2099     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2100   }
2101   movdl(vtmp2, src1);
2102   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2103   movdl(dst, vtmp1);
2104 }
2105 
2106 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2107   if (opcode == Op_AddReductionVI) {
2108     if (vtmp1 != src2) {
2109       movdqu(vtmp1, src2);
2110     }
2111     phaddd(vtmp1, src2);
2112     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2113   } else {
2114     pshufd(vtmp2, src2, 0xE);
2115     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2116     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2117   }
2118 }
2119 
2120 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2121   if (opcode == Op_AddReductionVI) {
2122     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2123     vextracti128_high(vtmp2, vtmp1);
2124     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2125     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2126   } else {
2127     vextracti128_high(vtmp1, src2);
2128     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2129     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2130   }
2131 }
2132 
2133 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2134   vextracti64x4_high(vtmp2, src2);
2135   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2136   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2137 }
2138 
2139 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2140   pshufd(vtmp2, src2, 0x1);
2141   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2142   movdqu(vtmp1, vtmp2);
2143   psrldq(vtmp1, 2);
2144   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2145   movdqu(vtmp2, vtmp1);
2146   psrldq(vtmp2, 1);
2147   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2148   movdl(vtmp2, src1);
2149   if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
2150     pmovzxbd(vtmp1, vtmp1);
2151   } else {
2152     pmovsxbd(vtmp1, vtmp1);
2153   }
2154   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2155   pextrb(dst, vtmp1, 0x0);
2156   movsbl(dst, dst);
2157 }
2158 
2159 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2160   pshufd(vtmp1, src2, 0xE);
2161   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2162   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2163 }
2164 
2165 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2166   vextracti128_high(vtmp2, src2);
2167   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2168   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2169 }
2170 
2171 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2172   vextracti64x4_high(vtmp1, src2);
2173   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2174   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2175 }
2176 
2177 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2178   pmovsxbw(vtmp2, src2);
2179   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2180 }
2181 
2182 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2183   if (UseAVX > 1) {
2184     int vector_len = Assembler::AVX_256bit;
2185     vpmovsxbw(vtmp1, src2, vector_len);
2186     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2187   } else {
2188     pmovsxbw(vtmp2, src2);
2189     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2190     pshufd(vtmp2, src2, 0xe);
2191     pmovsxbw(vtmp2, vtmp2);
2192     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2193   }
2194 }
2195 
2196 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2197   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2198     int vector_len = Assembler::AVX_512bit;
2199     vpmovsxbw(vtmp1, src2, vector_len);
2200     reduce32S(opcode, dst, src1, vtmp1, vtmp2, vtmp1);
2201   } else {
    assert(UseAVX >= 2, "required");
2203     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2204     vextracti128_high(vtmp2, src2);
2205     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2206   }
2207 }
2208 
2209 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2210   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2211   vextracti64x4_high(vtmp2, src2);
2212   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2213 }
2214 
2215 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2216   if (opcode == Op_AddReductionVI) {
2217     if (vtmp1 != src2) {
2218       movdqu(vtmp1, src2);
2219     }
2220     phaddw(vtmp1, vtmp1);
2221     phaddw(vtmp1, vtmp1);
2222   } else {
2223     pshufd(vtmp2, src2, 0x1);
2224     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2225     movdqu(vtmp1, vtmp2);
2226     psrldq(vtmp1, 2);
2227     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2228   }
2229   movdl(vtmp2, src1);
2230   if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
2231     pmovzxwd(vtmp1, vtmp1);
2232   } else {
2233     pmovsxwd(vtmp1, vtmp1);
2234   }
2235   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2236   pextrw(dst, vtmp1, 0x0);
2237   movswl(dst, dst);
2238 }
2239 
2240 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2241   if (opcode == Op_AddReductionVI) {
2242     if (vtmp1 != src2) {
2243       movdqu(vtmp1, src2);
2244     }
2245     phaddw(vtmp1, src2);
2246   } else {
2247     assert_different_registers(src2, vtmp1);
2248     pshufd(vtmp1, src2, 0xE);
2249     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2250   }
2251   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2252 }
2253 
2254 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2255   if (opcode == Op_AddReductionVI) {
2256     int vector_len = Assembler::AVX_256bit;
2257     vphaddw(vtmp2, src2, src2, vector_len);
2258     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2259   } else {
2260     assert_different_registers(src2, vtmp2);
2261     vextracti128_high(vtmp2, src2);
2262     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2263   }
2264   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2265 }
2266 
2267 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2268   assert_different_registers(src2, vtmp1);
2269   int vector_len = Assembler::AVX_256bit;
2270   vextracti64x4_high(vtmp1, src2);
2271   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2272   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2273 }
2274 
2275 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2276   pshufd(vtmp2, src2, 0xE);
2277   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2278   movdq(vtmp1, src1);
2279   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2280   movdq(dst, vtmp1);
2281 }
2282 
2283 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2284   vextracti128_high(vtmp1, src2);
2285   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2286   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2287 }
2288 
2289 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2290   vextracti64x4_high(vtmp2, src2);
2291   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2292   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2293 }
2294 
2295 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
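  // Materialize a mask with the low `len` bits set: bzhiq zeroes all bits of
  // -1 at positions >= len, and the result is moved into the k-register.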
2296   mov64(temp, -1L);
2297   bzhiq(temp, temp, len);
2298   kmovql(dst, temp);
2299 }
2300 
2301 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2302   reduce_operation_128(T_FLOAT, opcode, dst, src);
2303   pshufd(vtmp, src, 0x1);
2304   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2305 }
2306 
2307 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2308   reduce2F(opcode, dst, src, vtmp);
2309   pshufd(vtmp, src, 0x2);
2310   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2311   pshufd(vtmp, src, 0x3);
2312   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2313 }
2314 
2315 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2316   reduce4F(opcode, dst, src, vtmp2);
2317   vextractf128_high(vtmp2, src);
2318   reduce4F(opcode, dst, vtmp2, vtmp1);
2319 }
2320 
2321 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2322   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2323   vextracti64x4_high(vtmp1, src);
2324   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2325 }
2326 
2327 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2328   pshufd(dst, src, 0x1);
2329   reduce_operation_128(T_FLOAT, opcode, dst, src);
2330 }
2331 
2332 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2333   pshufd(vtmp, src, 0xE);
2334   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2335   unorderedReduce2F(opcode, dst, vtmp);
2336 }
2337 
2338 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2339   vextractf128_high(vtmp1, src);
2340   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2341   unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2342 }
2343 
2344 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2345   vextractf64x4_high(vtmp2, src);
2346   unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2347   unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2348 }
2349 
2350 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2351   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2352   pshufd(vtmp, src, 0xE);
2353   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2354 }
2355 
2356 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2357   reduce2D(opcode, dst, src, vtmp2);
2358   vextractf128_high(vtmp2, src);
2359   reduce2D(opcode, dst, vtmp2, vtmp1);
2360 }
2361 
2362 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2363   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2364   vextracti64x4_high(vtmp1, src);
2365   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2366 }
2367 
2368 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2369   pshufd(dst, src, 0xE);
2370   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2371 }
2372 
2373 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2374   vextractf128_high(vtmp, src);
2375   unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2376   unorderedReduce2D(opcode, dst, vtmp);
2377 }
2378 
2379 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2380   vextractf64x4_high(vtmp2, src);
2381   unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2382   unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2383 }
2384 
2385 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2386   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2387 }
2388 
2389 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2390   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2391 }
2392 
2393 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
2394   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2395 }
2396 
2397 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2398                                  int vec_enc) {
2399   switch(elem_bt) {
2400     case T_INT:
2401     case T_FLOAT:
2402       vmaskmovps(dst, src, mask, vec_enc);
2403       break;
2404     case T_LONG:
2405     case T_DOUBLE:
2406       vmaskmovpd(dst, src, mask, vec_enc);
2407       break;
2408     default:
2409       fatal("Unsupported type %s", type2name(elem_bt));
2410       break;
2411   }
2412 }
2413 
2414 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2415                                  int vec_enc) {
2416   switch(elem_bt) {
2417     case T_INT:
2418     case T_FLOAT:
2419       vmaskmovps(dst, src, mask, vec_enc);
2420       break;
2421     case T_LONG:
2422     case T_DOUBLE:
2423       vmaskmovpd(dst, src, mask, vec_enc);
2424       break;
2425     default:
2426       fatal("Unsupported type %s", type2name(elem_bt));
2427       break;
2428   }
2429 }
2430 
2431 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2432                                           XMMRegister dst, XMMRegister src,
2433                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2434                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2435   const int permconst[] = {1, 14};
2436   XMMRegister wsrc = src;
2437   XMMRegister wdst = xmm_0;
2438   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2439 
2440   int vlen_enc = Assembler::AVX_128bit;
2441   if (vlen == 16) {
2442     vlen_enc = Assembler::AVX_256bit;
2443   }
2444 
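  // log2(vlen) folding steps: each step combines the upper half of the active
  // vector with the lower half (cross-register extracts while wider than
  // 128 bits, in-lane permutes via permconst for the final pairs).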
  for (int i = log2(vlen) - 1; i >= 0; i--) {
2446     if (i == 0 && !is_dst_valid) {
2447       wdst = dst;
2448     }
2449     if (i == 3) {
2450       vextracti64x4_high(wtmp, wsrc);
2451     } else if (i == 2) {
2452       vextracti128_high(wtmp, wsrc);
2453     } else { // i = [0,1]
2454       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2455     }
2456 
2457     if (VM_Version::supports_avx10_2()) {
2458       vminmax_fp_avx10_2(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
2459     } else {
2460       vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2461     }
2462     wsrc = wdst;
2463     vlen_enc = Assembler::AVX_128bit;
2464   }
2465   if (is_dst_valid) {
2466     if (VM_Version::supports_avx10_2()) {
2467       vminmax_fp_avx10_2(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
2468     } else {
2469       vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2470     }
2471   }
2472 }
2473 
2474 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2475                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2476                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2477   XMMRegister wsrc = src;
2478   XMMRegister wdst = xmm_0;
2479   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2480   int vlen_enc = Assembler::AVX_128bit;
2481   if (vlen == 8) {
2482     vlen_enc = Assembler::AVX_256bit;
2483   }
  for (int i = log2(vlen) - 1; i >= 0; i--) {
2485     if (i == 0 && !is_dst_valid) {
2486       wdst = dst;
2487     }
2488     if (i == 1) {
2489       vextracti128_high(wtmp, wsrc);
2490     } else if (i == 2) {
2491       vextracti64x4_high(wtmp, wsrc);
2492     } else {
2493       assert(i == 0, "%d", i);
2494       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2495     }
2496 
2497     if (VM_Version::supports_avx10_2()) {
2498       vminmax_fp_avx10_2(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
2499     } else {
2500       vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2501     }
2502 
2503     wsrc = wdst;
2504     vlen_enc = Assembler::AVX_128bit;
2505   }
2506 
2507   if (is_dst_valid) {
2508     if (VM_Version::supports_avx10_2()) {
2509       vminmax_fp_avx10_2(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
2510     } else {
2511       vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2512     }
2513   }
2514 }
2515 
2516 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2517   switch (bt) {
2518     case T_BYTE:  pextrb(dst, src, idx); break;
2519     case T_SHORT: pextrw(dst, src, idx); break;
2520     case T_INT:   pextrd(dst, src, idx); break;
2521     case T_LONG:  pextrq(dst, src, idx); break;
2522 
2523     default:
2524       assert(false,"Should not reach here.");
2525       break;
2526   }
2527 }
2528 
2529 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2530   int esize =  type2aelembytes(typ);
2531   int elem_per_lane = 16/esize;
2532   int lane = elemindex / elem_per_lane;
2533   int eindex = elemindex % elem_per_lane;
2534 
2535   if (lane >= 2) {
2536     assert(UseAVX > 2, "required");
2537     vextractf32x4(dst, src, lane & 3);
2538     return dst;
2539   } else if (lane > 0) {
2540     assert(UseAVX > 0, "required");
2541     vextractf128(dst, src, lane);
2542     return dst;
2543   } else {
2544     return src;
2545   }
2546 }
2547 
2548 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2549   if (typ == T_BYTE) {
2550     movsbl(dst, dst);
2551   } else if (typ == T_SHORT) {
2552     movswl(dst, dst);
2553   }
2554 }
2555 
2556 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2557   int esize =  type2aelembytes(typ);
2558   int elem_per_lane = 16/esize;
2559   int eindex = elemindex % elem_per_lane;
2560   assert(is_integral_type(typ),"required");
2561 
2562   if (eindex == 0) {
2563     if (typ == T_LONG) {
2564       movq(dst, src);
2565     } else {
2566       movdl(dst, src);
2567       movsxl(typ, dst);
2568     }
2569   } else {
2570     extract(typ, dst, src, eindex);
2571     movsxl(typ, dst);
2572   }
2573 }
2574 
2575 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2576   int esize =  type2aelembytes(typ);
2577   int elem_per_lane = 16/esize;
2578   int eindex = elemindex % elem_per_lane;
2579   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2580 
2581   if (eindex == 0) {
2582     movq(dst, src);
2583   } else {
2584     if (typ == T_FLOAT) {
2585       if (UseAVX == 0) {
2586         movdqu(dst, src);
2587         shufps(dst, dst, eindex);
2588       } else {
2589         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2590       }
2591     } else {
2592       if (UseAVX == 0) {
2593         movdqu(dst, src);
2594         psrldq(dst, eindex*esize);
2595       } else {
2596         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2597       }
2598       movq(dst, dst);
2599     }
2600   }
2601   // Zero upper bits
2602   if (typ == T_FLOAT) {
2603     if (UseAVX == 0) {
2604       assert(vtmp != xnoreg, "required.");
2605       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2606       pand(dst, vtmp);
2607     } else {
2608       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2609     }
2610   }
2611 }
2612 
2613 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2614   switch(typ) {
2615     case T_BYTE:
2616     case T_BOOLEAN:
2617       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2618       break;
2619     case T_SHORT:
2620     case T_CHAR:
2621       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2622       break;
2623     case T_INT:
2624     case T_FLOAT:
2625       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2626       break;
2627     case T_LONG:
2628     case T_DOUBLE:
2629       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2630       break;
2631     default:
2632       assert(false,"Should not reach here.");
2633       break;
2634   }
2635 }
2636 
2637 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2638   assert(rscratch != noreg || always_reachable(src2), "missing");
2639 
2640   switch(typ) {
2641     case T_BOOLEAN:
2642     case T_BYTE:
2643       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2644       break;
2645     case T_CHAR:
2646     case T_SHORT:
2647       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2648       break;
2649     case T_INT:
2650     case T_FLOAT:
2651       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2652       break;
2653     case T_LONG:
2654     case T_DOUBLE:
2655       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2656       break;
2657     default:
2658       assert(false,"Should not reach here.");
2659       break;
2660   }
2661 }
2662 
2663 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2664   switch(typ) {
2665     case T_BYTE:
2666       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2667       break;
2668     case T_SHORT:
2669       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2670       break;
2671     case T_INT:
2672     case T_FLOAT:
2673       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2674       break;
2675     case T_LONG:
2676     case T_DOUBLE:
2677       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2678       break;
2679     default:
2680       assert(false,"Should not reach here.");
2681       break;
2682   }
2683 }
2684 
2685 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2686   assert(vlen_in_bytes <= 32, "");
2687   int esize = type2aelembytes(bt);
2688   if (vlen_in_bytes == 32) {
2689     assert(vtmp == xnoreg, "required.");
2690     if (esize >= 4) {
2691       vtestps(src1, src2, AVX_256bit);
2692     } else {
2693       vptest(src1, src2, AVX_256bit);
2694     }
2695     return;
2696   }
2697   if (vlen_in_bytes < 16) {
    // Duplicate the lower part to fill the whole register;
    // there is no need to do so for src2.
2700     assert(vtmp != xnoreg, "required");
2701     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2702     pshufd(vtmp, src1, shuffle_imm);
2703   } else {
2704     assert(vtmp == xnoreg, "required");
2705     vtmp = src1;
2706   }
2707   if (esize >= 4 && VM_Version::supports_avx()) {
2708     vtestps(vtmp, src2, AVX_128bit);
2709   } else {
2710     ptest(vtmp, src2);
2711   }
2712 }
2713 
2714 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2715 #ifdef ASSERT
2716   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2717   bool is_bw_supported = VM_Version::supports_avx512bw();
2718   if (is_bw && !is_bw_supported) {
2719     assert(vlen_enc != Assembler::AVX_512bit, "required");
2720     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2721            "XMM register should be 0-15");
2722   }
2723 #endif // ASSERT
2724   switch (elem_bt) {
2725     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2726     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2727     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2728     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2729     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2730     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2731     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2732   }
2733 }
2734 
2735 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2736   assert(UseAVX >= 2, "required");
2737   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2738   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2739   if ((UseAVX > 2) &&
2740       (!is_bw || VM_Version::supports_avx512bw()) &&
2741       (!is_vl || VM_Version::supports_avx512vl())) {
2742     switch (elem_bt) {
2743       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2744       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2745       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2746       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2747       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2748     }
2749   } else {
2750     assert(vlen_enc != Assembler::AVX_512bit, "required");
2751     assert((dst->encoding() < 16),"XMM register should be 0-15");
2752     switch (elem_bt) {
2753       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2754       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2755       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2756       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2757       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2758       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2759       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2760     }
2761   }
2762 }
2763 
2764 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2765   switch (to_elem_bt) {
2766     case T_SHORT:
2767       vpmovsxbw(dst, src, vlen_enc);
2768       break;
2769     case T_INT:
2770       vpmovsxbd(dst, src, vlen_enc);
2771       break;
2772     case T_FLOAT:
2773       vpmovsxbd(dst, src, vlen_enc);
2774       vcvtdq2ps(dst, dst, vlen_enc);
2775       break;
2776     case T_LONG:
2777       vpmovsxbq(dst, src, vlen_enc);
2778       break;
2779     case T_DOUBLE: {
2780       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
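      // byte->double widens 8x: sign-extend bytes to ints at half the
      // final vector width, then vcvtdq2pd doubles the element size.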
2781       vpmovsxbd(dst, src, mid_vlen_enc);
2782       vcvtdq2pd(dst, dst, vlen_enc);
2783       break;
2784     }
2785     default:
2786       fatal("Unsupported type %s", type2name(to_elem_bt));
2787       break;
2788   }
2789 }
2790 
2791 //-------------------------------------------------------------------------------------------
2792 
// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through the stack.
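// A rough Java-level sketch of what the generated code computes
// (illustrative only; matchesAt() is a made-up helper -- the real code
// scans the string in 16-byte vectors with pcmpestri):
//   for (int i = 0; i <= string.count - substr.count; i++) {
//     if (matchesAt(string, substr, i)) return i;
//   }
//   return -1;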
2795 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2796                                          Register cnt1, Register cnt2,
2797                                          int int_cnt2,  Register result,
2798                                          XMMRegister vec, Register tmp,
2799                                          int ae) {
2800   ShortBranchVerifier sbv(this);
2801   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2802   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2803 
2804   // This method uses the pcmpestri instruction with bound registers
2805   //   inputs:
2806   //     xmm - substring
2807   //     rax - substring length (elements count)
2808   //     mem - scanned string
2809   //     rdx - string length (elements count)
2810   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2811   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2812   //   outputs:
2813   //     rcx - matched index in string
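  //   e.g. mode 0x0d == 0b1101: bits [1:0] = 01 select unsigned words,
  //   bits [3:2] = 11 select "equal ordered" (substring) aggregation;
  //   the polarity and index-selection bits stay zero.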
2814   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2815   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2816   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2817   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2818   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2819 
2820   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2821         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2822         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2823 
2824   // Note, inline_string_indexOf() generates checks:
2825   // if (substr.count > string.count) return -1;
2826   // if (substr.count == 0) return 0;
2827   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2828 
2829   // Load substring.
2830   if (ae == StrIntrinsicNode::UL) {
2831     pmovzxbw(vec, Address(str2, 0));
2832   } else {
2833     movdqu(vec, Address(str2, 0));
2834   }
2835   movl(cnt2, int_cnt2);
2836   movptr(result, str1); // string addr
2837 
2838   if (int_cnt2 > stride) {
2839     jmpb(SCAN_TO_SUBSTR);
2840 
    // Reload substr for rescan; this code
    // is executed only for large substrings (> 8 chars).
2843     bind(RELOAD_SUBSTR);
2844     if (ae == StrIntrinsicNode::UL) {
2845       pmovzxbw(vec, Address(str2, 0));
2846     } else {
2847       movdqu(vec, Address(str2, 0));
2848     }
2849     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2850 
2851     bind(RELOAD_STR);
2852     // We came here after the beginning of the substring was
2853     // matched but the rest of it was not so we need to search
2854     // again. Start from the next element after the previous match.
2855 
    // cnt2 is the number of remaining substring elements and
    // cnt1 is the number of remaining string elements when the cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2859     subl(cnt1, cnt2);
2860     addl(cnt1, int_cnt2);
2861     movl(cnt2, int_cnt2); // Now restore cnt2
2862 
2863     decrementl(cnt1);     // Shift to next element
2864     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2866 
2867     addptr(result, (1<<scale1));
2868 
2869   } // (int_cnt2 > 8)
2870 
2871   // Scan string for start of substr in 16-byte vectors
2872   bind(SCAN_TO_SUBSTR);
2873   pcmpestri(vec, Address(result, 0), mode);
2874   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2875   subl(cnt1, stride);
2876   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2877   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2879   addptr(result, 16);
2880   jmpb(SCAN_TO_SUBSTR);
2881 
2882   // Found a potential substr
2883   bind(FOUND_CANDIDATE);
2884   // Matched whole vector if first element matched (tmp(rcx) == 0).
2885   if (int_cnt2 == stride) {
2886     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2887   } else { // int_cnt2 > 8
2888     jccb(Assembler::overflow, FOUND_SUBSTR);
2889   }
2890   // After pcmpestri tmp(rcx) contains matched element index
2891   // Compute start addr of substr
2892   lea(result, Address(result, tmp, scale1));
2893 
2894   // Make sure string is still long enough
2895   subl(cnt1, tmp);
2896   cmpl(cnt1, cnt2);
2897   if (int_cnt2 == stride) {
2898     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2899   } else { // int_cnt2 > 8
2900     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2901   }
  // Left less than substring.
2903 
2904   bind(RET_NOT_FOUND);
2905   movl(result, -1);
2906   jmp(EXIT);
2907 
2908   if (int_cnt2 > stride) {
2909     // This code is optimized for the case when whole substring
2910     // is matched if its head is matched.
2911     bind(MATCH_SUBSTR_HEAD);
2912     pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
2914     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2915 
2916     Label CONT_SCAN_SUBSTR;
2917     // Compare the rest of substring (> 8 chars).
2918     bind(FOUND_SUBSTR);
2919     // First 8 chars are already matched.
2920     negptr(cnt2);
2921     addptr(cnt2, stride);
2922 
2923     bind(SCAN_SUBSTR);
2924     subl(cnt1, stride);
2925     cmpl(cnt2, -stride); // Do not read beyond substring
2926     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2927     // Back-up strings to avoid reading beyond substring:
2928     // cnt1 = cnt1 - cnt2 + 8
2929     addl(cnt1, cnt2); // cnt2 is negative
2930     addl(cnt1, stride);
2931     movl(cnt2, stride); negptr(cnt2);
2932     bind(CONT_SCAN_SUBSTR);
2933     if (int_cnt2 < (int)G) {
2934       int tail_off1 = int_cnt2<<scale1;
2935       int tail_off2 = int_cnt2<<scale2;
2936       if (ae == StrIntrinsicNode::UL) {
2937         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2938       } else {
2939         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2940       }
2941       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2942     } else {
2943       // calculate index in register to avoid integer overflow (int_cnt2*2)
2944       movl(tmp, int_cnt2);
2945       addptr(tmp, cnt2);
2946       if (ae == StrIntrinsicNode::UL) {
2947         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2948       } else {
2949         movdqu(vec, Address(str2, tmp, scale2, 0));
2950       }
2951       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2952     }
    // Need to reload string pointers if the whole vector did not match
2954     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2955     addptr(cnt2, stride);
2956     jcc(Assembler::negative, SCAN_SUBSTR);
2957     // Fall through if found full substring
2958 
2959   } // (int_cnt2 > 8)
2960 
2961   bind(RET_FOUND);
2962   // Found result if we matched full small substring.
2963   // Compute substr offset
2964   subptr(result, str1);
2965   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2966     shrl(result, 1); // index
2967   }
2968   bind(EXIT);
2969 
2970 } // string_indexofC8
2971 
// Small strings are loaded through the stack if they cross a page boundary.
2973 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2974                                        Register cnt1, Register cnt2,
2975                                        int int_cnt2,  Register result,
2976                                        XMMRegister vec, Register tmp,
2977                                        int ae) {
2978   ShortBranchVerifier sbv(this);
2979   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2980   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2981 
2982   //
2983   // int_cnt2 is length of small (< 8 chars) constant substring
2984   // or (-1) for non constant substring in which case its length
2985   // is in cnt2 register.
2986   //
2987   // Note, inline_string_indexOf() generates checks:
2988   // if (substr.count > string.count) return -1;
2989   // if (substr.count == 0) return 0;
2990   //
2991   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2992   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2993   // This method uses the pcmpestri instruction with bound registers
2994   //   inputs:
2995   //     xmm - substring
2996   //     rax - substring length (elements count)
2997   //     mem - scanned string
2998   //     rdx - string length (elements count)
2999   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
3000   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
3001   //   outputs:
3002   //     rcx - matched index in string
3003   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3004   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
3005   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
3006   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
3007 
3008   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
3009         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
3010         FOUND_CANDIDATE;
3011 
3012   { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through the stack.
3015     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
3016 
3017     movptr(tmp, rsp); // save old SP
3018 
3019     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
3020       if (int_cnt2 == (1>>scale2)) { // One byte
3021         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
3022         load_unsigned_byte(result, Address(str2, 0));
3023         movdl(vec, result); // move 32 bits
3024       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
3025         // Not enough header space in 32-bit VM: 12+3 = 15.
3026         movl(result, Address(str2, -1));
3027         shrl(result, 8);
3028         movdl(vec, result); // move 32 bits
3029       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
3030         load_unsigned_short(result, Address(str2, 0));
3031         movdl(vec, result); // move 32 bits
3032       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
3033         movdl(vec, Address(str2, 0)); // move 32 bits
3034       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
3035         movq(vec, Address(str2, 0));  // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
3037         // Array header size is 12 bytes in 32-bit VM
3038         // + 6 bytes for 3 chars == 18 bytes,
3039         // enough space to load vec and shift.
3040         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3041         if (ae == StrIntrinsicNode::UL) {
3042           int tail_off = int_cnt2-8;
3043           pmovzxbw(vec, Address(str2, tail_off));
3044           psrldq(vec, -2*tail_off);
3045         }
3046         else {
3047           int tail_off = int_cnt2*(1<<scale2);
3048           movdqu(vec, Address(str2, tail_off-16));
3049           psrldq(vec, 16-tail_off);
3050         }
3051       }
3052     } else { // not constant substring
3053       cmpl(cnt2, stride);
3054       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3055 
      // We can read beyond the string if str+16 does not cross a page
      // boundary, since heaps are aligned and mapped by pages.
3058       assert(os::vm_page_size() < (int)G, "default page should be small");
3059       movl(result, str2); // We need only low 32 bits
3060       andl(result, ((int)os::vm_page_size()-1));
3061       cmpl(result, ((int)os::vm_page_size()-16));
3062       jccb(Assembler::belowEqual, CHECK_STR);
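      // Reading 16 bytes at str2 is safe iff
      //   (str2 & (page_size - 1)) <= page_size - 16,
      // i.e. the 16-byte window does not spill into the next page.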
3063 
      // Move small strings to the stack to allow loading 16 bytes into vec.
3065       subptr(rsp, 16);
3066       int stk_offset = wordSize-(1<<scale2);
3067       push(cnt2);
3068 
3069       bind(COPY_SUBSTR);
3070       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3071         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3072         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3073       } else if (ae == StrIntrinsicNode::UU) {
3074         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3075         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3076       }
3077       decrement(cnt2);
3078       jccb(Assembler::notZero, COPY_SUBSTR);
3079 
3080       pop(cnt2);
3081       movptr(str2, rsp);  // New substring address
3082     } // non constant
3083 
3084     bind(CHECK_STR);
3085     cmpl(cnt1, stride);
3086     jccb(Assembler::aboveEqual, BIG_STRINGS);
3087 
    // Check whether str1 crosses a page boundary.
3089     movl(result, str1); // We need only low 32 bits
3090     andl(result, ((int)os::vm_page_size()-1));
3091     cmpl(result, ((int)os::vm_page_size()-16));
3092     jccb(Assembler::belowEqual, BIG_STRINGS);
3093 
3094     subptr(rsp, 16);
3095     int stk_offset = -(1<<scale1);
3096     if (int_cnt2 < 0) { // not constant
3097       push(cnt2);
3098       stk_offset += wordSize;
3099     }
3100     movl(cnt2, cnt1);
3101 
3102     bind(COPY_STR);
3103     if (ae == StrIntrinsicNode::LL) {
3104       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3105       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3106     } else {
3107       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3108       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3109     }
3110     decrement(cnt2);
3111     jccb(Assembler::notZero, COPY_STR);
3112 
3113     if (int_cnt2 < 0) { // not constant
3114       pop(cnt2);
3115     }
3116     movptr(str1, rsp);  // New string address
3117 
3118     bind(BIG_STRINGS);
3119     // Load substring.
3120     if (int_cnt2 < 0) { // -1
3121       if (ae == StrIntrinsicNode::UL) {
3122         pmovzxbw(vec, Address(str2, 0));
3123       } else {
3124         movdqu(vec, Address(str2, 0));
3125       }
3126       push(cnt2);       // substr count
3127       push(str2);       // substr addr
3128       push(str1);       // string addr
3129     } else {
3130       // Small (< 8 chars) constant substrings are loaded already.
3131       movl(cnt2, int_cnt2);
3132     }
3133     push(tmp);  // original SP
3134 
3135   } // Finished loading
3136 
3137   //========================================================
3138   // Start search
3139   //
3140 
3141   movptr(result, str1); // string addr
3142 
3143   if (int_cnt2  < 0) {  // Only for non constant substring
3144     jmpb(SCAN_TO_SUBSTR);
3145 
3146     // SP saved at sp+0
3147     // String saved at sp+1*wordSize
3148     // Substr saved at sp+2*wordSize
3149     // Substr count saved at sp+3*wordSize
3150 
    // Reload substr for rescan; this code
    // is executed only for large substrings (> 8 chars).
3153     bind(RELOAD_SUBSTR);
3154     movptr(str2, Address(rsp, 2*wordSize));
3155     movl(cnt2, Address(rsp, 3*wordSize));
3156     if (ae == StrIntrinsicNode::UL) {
3157       pmovzxbw(vec, Address(str2, 0));
3158     } else {
3159       movdqu(vec, Address(str2, 0));
3160     }
3161     // We came here after the beginning of the substring was
3162     // matched but the rest of it was not so we need to search
3163     // again. Start from the next element after the previous match.
3164     subptr(str1, result); // Restore counter
3165     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3166       shrl(str1, 1);
3167     }
3168     addl(cnt1, str1);
3169     decrementl(cnt1);   // Shift to next element
3170     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3172 
3173     addptr(result, (1<<scale1));
3174   } // non constant
3175 
3176   // Scan string for start of substr in 16-byte vectors
3177   bind(SCAN_TO_SUBSTR);
3178   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3179   pcmpestri(vec, Address(result, 0), mode);
3180   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3181   subl(cnt1, stride);
3182   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3183   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3185   addptr(result, 16);
3186 
3187   bind(ADJUST_STR);
3188   cmpl(cnt1, stride); // Do not read beyond string
3189   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3190   // Back-up string to avoid reading beyond string.
3191   lea(result, Address(result, cnt1, scale1, -16));
3192   movl(cnt1, stride);
3193   jmpb(SCAN_TO_SUBSTR);
3194 
3195   // Found a potential substr
3196   bind(FOUND_CANDIDATE);
3197   // After pcmpestri tmp(rcx) contains matched element index
3198 
3199   // Make sure string is still long enough
3200   subl(cnt1, tmp);
3201   cmpl(cnt1, cnt2);
3202   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.
3204 
3205   bind(RET_NOT_FOUND);
3206   movl(result, -1);
3207   jmp(CLEANUP);
3208 
3209   bind(FOUND_SUBSTR);
3210   // Compute start addr of substr
3211   lea(result, Address(result, tmp, scale1));
3212   if (int_cnt2 > 0) { // Constant substring
3213     // Repeat search for small substring (< 8 chars)
3214     // from new point without reloading substring.
3215     // Have to check that we don't read beyond string.
3216     cmpl(tmp, stride-int_cnt2);
3217     jccb(Assembler::greater, ADJUST_STR);
3218     // Fall through if matched whole substring.
3219   } else { // non constant
3220     assert(int_cnt2 == -1, "should be != 0");
3221 
3222     addl(tmp, cnt2);
3223     // Found result if we matched whole substring.
3224     cmpl(tmp, stride);
3225     jcc(Assembler::lessEqual, RET_FOUND);
3226 
3227     // Repeat search for small substring (<= 8 chars)
3228     // from new point 'str1' without reloading substring.
3229     cmpl(cnt2, stride);
3230     // Have to check that we don't read beyond string.
3231     jccb(Assembler::lessEqual, ADJUST_STR);
3232 
3233     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3234     // Compare the rest of substring (> 8 chars).
3235     movptr(str1, result);
3236 
3237     cmpl(tmp, cnt2);
3238     // First 8 chars are already matched.
3239     jccb(Assembler::equal, CHECK_NEXT);
3240 
3241     bind(SCAN_SUBSTR);
3242     pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload string pointers if the whole vector did not match
3244     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3245 
3246     bind(CHECK_NEXT);
3247     subl(cnt2, stride);
3248     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3249     addptr(str1, 16);
3250     if (ae == StrIntrinsicNode::UL) {
3251       addptr(str2, 8);
3252     } else {
3253       addptr(str2, 16);
3254     }
3255     subl(cnt1, stride);
3256     cmpl(cnt2, stride); // Do not read beyond substring
3257     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3258     // Back-up strings to avoid reading beyond substring.
3259 
3260     if (ae == StrIntrinsicNode::UL) {
3261       lea(str2, Address(str2, cnt2, scale2, -8));
3262       lea(str1, Address(str1, cnt2, scale1, -16));
3263     } else {
3264       lea(str2, Address(str2, cnt2, scale2, -16));
3265       lea(str1, Address(str1, cnt2, scale1, -16));
3266     }
3267     subl(cnt1, cnt2);
3268     movl(cnt2, stride);
3269     addl(cnt1, stride);
3270     bind(CONT_SCAN_SUBSTR);
3271     if (ae == StrIntrinsicNode::UL) {
3272       pmovzxbw(vec, Address(str2, 0));
3273     } else {
3274       movdqu(vec, Address(str2, 0));
3275     }
3276     jmp(SCAN_SUBSTR);
3277 
3278     bind(RET_FOUND_LONG);
3279     movptr(str1, Address(rsp, wordSize));
3280   } // non constant
3281 
3282   bind(RET_FOUND);
3283   // Compute substr offset
3284   subptr(result, str1);
3285   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3286     shrl(result, 1); // index
3287   }
3288   bind(CLEANUP);
3289   pop(rsp); // restore SP
3290 
3291 } // string_indexof
3292 
3293 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3294                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3295   ShortBranchVerifier sbv(this);
3296   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3297 
3298   int stride = 8;
3299 
3300   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3301         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3302         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3303         FOUND_SEQ_CHAR, DONE_LABEL;
3304 
3305   movptr(result, str1);
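  // vec1 will hold 'ch' broadcast to every lane and vec2 stays zero;
  // after the lane-wise compare, (v)ptest(vec2, vec3) sets CF only when
  // vec3 is all-zero, so a clear carry flag means some lane matched.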
3306   if (UseAVX >= 2) {
3307     cmpl(cnt1, stride);
3308     jcc(Assembler::less, SCAN_TO_CHAR);
3309     cmpl(cnt1, 2*stride);
3310     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3311     movdl(vec1, ch);
3312     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3313     vpxor(vec2, vec2);
3314     movl(tmp, cnt1);
3315     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3316     andl(cnt1,0x0000000F);  //tail count (in chars)
3317 
3318     bind(SCAN_TO_16_CHAR_LOOP);
3319     vmovdqu(vec3, Address(result, 0));
3320     vpcmpeqw(vec3, vec3, vec1, 1);
3321     vptest(vec2, vec3);
3322     jcc(Assembler::carryClear, FOUND_CHAR);
3323     addptr(result, 32);
3324     subl(tmp, 2*stride);
3325     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3326     jmp(SCAN_TO_8_CHAR);
3327     bind(SCAN_TO_8_CHAR_INIT);
3328     movdl(vec1, ch);
3329     pshuflw(vec1, vec1, 0x00);
3330     pshufd(vec1, vec1, 0);
3331     pxor(vec2, vec2);
3332   }
3333   bind(SCAN_TO_8_CHAR);
3334   cmpl(cnt1, stride);
3335   jcc(Assembler::less, SCAN_TO_CHAR);
3336   if (UseAVX < 2) {
3337     movdl(vec1, ch);
3338     pshuflw(vec1, vec1, 0x00);
3339     pshufd(vec1, vec1, 0);
3340     pxor(vec2, vec2);
3341   }
3342   movl(tmp, cnt1);
3343   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3344   andl(cnt1,0x00000007);  //tail count (in chars)
3345 
3346   bind(SCAN_TO_8_CHAR_LOOP);
3347   movdqu(vec3, Address(result, 0));
3348   pcmpeqw(vec3, vec1);
3349   ptest(vec2, vec3);
3350   jcc(Assembler::carryClear, FOUND_CHAR);
3351   addptr(result, 16);
3352   subl(tmp, stride);
3353   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3354   bind(SCAN_TO_CHAR);
3355   testl(cnt1, cnt1);
3356   jcc(Assembler::zero, RET_NOT_FOUND);
3357   bind(SCAN_TO_CHAR_LOOP);
3358   load_unsigned_short(tmp, Address(result, 0));
3359   cmpl(ch, tmp);
3360   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3361   addptr(result, 2);
3362   subl(cnt1, 1);
3363   jccb(Assembler::zero, RET_NOT_FOUND);
3364   jmp(SCAN_TO_CHAR_LOOP);
3365 
3366   bind(RET_NOT_FOUND);
3367   movl(result, -1);
3368   jmpb(DONE_LABEL);
3369 
3370   bind(FOUND_CHAR);
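  // (v)pmovmskb yields one mask bit per byte of the compare result;
  // bsf then returns the byte offset of the first matching lane, which
  // is added to the scan pointer before the final index computation.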
3371   if (UseAVX >= 2) {
3372     vpmovmskb(tmp, vec3);
3373   } else {
3374     pmovmskb(tmp, vec3);
3375   }
3376   bsfl(ch, tmp);
3377   addptr(result, ch);
3378 
3379   bind(FOUND_SEQ_CHAR);
3380   subptr(result, str1);
3381   shrl(result, 1);
3382 
3383   bind(DONE_LABEL);
3384 } // string_indexof_char
3385 
3386 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3387                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3388   ShortBranchVerifier sbv(this);
3389   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3390 
3391   int stride = 16;
3392 
3393   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3394         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3395         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3396         FOUND_SEQ_CHAR, DONE_LABEL;
3397 
3398   movptr(result, str1);
3399   if (UseAVX >= 2) {
3400     cmpl(cnt1, stride);
3401     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3402     cmpl(cnt1, stride*2);
3403     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3404     movdl(vec1, ch);
3405     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3406     vpxor(vec2, vec2);
3407     movl(tmp, cnt1);
3408     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3409     andl(cnt1,0x0000001F);  //tail count (in chars)
3410 
3411     bind(SCAN_TO_32_CHAR_LOOP);
3412     vmovdqu(vec3, Address(result, 0));
3413     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3414     vptest(vec2, vec3);
3415     jcc(Assembler::carryClear, FOUND_CHAR);
3416     addptr(result, 32);
3417     subl(tmp, stride*2);
3418     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3419     jmp(SCAN_TO_16_CHAR);
3420 
3421     bind(SCAN_TO_16_CHAR_INIT);
3422     movdl(vec1, ch);
3423     pxor(vec2, vec2);
3424     pshufb(vec1, vec2);
3425   }
3426 
3427   bind(SCAN_TO_16_CHAR);
3428   cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3430   if (UseAVX < 2) {
3431     movdl(vec1, ch);
3432     pxor(vec2, vec2);
3433     pshufb(vec1, vec2);
3434   }
3435   movl(tmp, cnt1);
3436   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3437   andl(cnt1,0x0000000F);  //tail count (in bytes)
3438 
3439   bind(SCAN_TO_16_CHAR_LOOP);
3440   movdqu(vec3, Address(result, 0));
3441   pcmpeqb(vec3, vec1);
3442   ptest(vec2, vec3);
3443   jcc(Assembler::carryClear, FOUND_CHAR);
3444   addptr(result, 16);
3445   subl(tmp, stride);
3446   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3447 
3448   bind(SCAN_TO_CHAR_INIT);
3449   testl(cnt1, cnt1);
3450   jcc(Assembler::zero, RET_NOT_FOUND);
3451   bind(SCAN_TO_CHAR_LOOP);
3452   load_unsigned_byte(tmp, Address(result, 0));
3453   cmpl(ch, tmp);
3454   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3455   addptr(result, 1);
3456   subl(cnt1, 1);
3457   jccb(Assembler::zero, RET_NOT_FOUND);
3458   jmp(SCAN_TO_CHAR_LOOP);
3459 
3460   bind(RET_NOT_FOUND);
3461   movl(result, -1);
3462   jmpb(DONE_LABEL);
3463 
3464   bind(FOUND_CHAR);
3465   if (UseAVX >= 2) {
3466     vpmovmskb(tmp, vec3);
3467   } else {
3468     pmovmskb(tmp, vec3);
3469   }
3470   bsfl(ch, tmp);
3471   addptr(result, ch);
3472 
3473   bind(FOUND_SEQ_CHAR);
3474   subptr(result, str1);
3475 
3476   bind(DONE_LABEL);
3477 } // stringL_indexof_char
3478 
3479 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3480   switch (eltype) {
3481   case T_BOOLEAN: return sizeof(jboolean);
3482   case T_BYTE:  return sizeof(jbyte);
3483   case T_SHORT: return sizeof(jshort);
3484   case T_CHAR:  return sizeof(jchar);
3485   case T_INT:   return sizeof(jint);
3486   default:
3487     ShouldNotReachHere();
3488     return -1;
3489   }
3490 }
3491 
3492 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3493   switch (eltype) {
3494   // T_BOOLEAN used as surrogate for unsigned byte
3495   case T_BOOLEAN: movzbl(dst, src);   break;
3496   case T_BYTE:    movsbl(dst, src);   break;
3497   case T_SHORT:   movswl(dst, src);   break;
3498   case T_CHAR:    movzwl(dst, src);   break;
3499   case T_INT:     movl(dst, src);     break;
3500   default:
3501     ShouldNotReachHere();
3502   }
3503 }
3504 
3505 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3506   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3507 }
3508 
3509 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3510   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3511 }
3512 
3513 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3514   const int vlen = Assembler::AVX_256bit;
3515   switch (eltype) {
3516   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3517   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3518   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3519   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3520   case T_INT:
3521     // do nothing
3522     break;
3523   default:
3524     ShouldNotReachHere();
3525   }
3526 }
3527 
3528 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3529                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3530                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3531                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3532                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3533                                         BasicType eltype) {
3534   ShortBranchVerifier sbv(this);
3535   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3536   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3537   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3538 
3539   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3540         SHORT_UNROLLED_LOOP_EXIT,
3541         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3542         UNROLLED_VECTOR_LOOP_BEGIN,
3543         END;
3544   switch (eltype) {
3545   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3546   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3547   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3548   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3549   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3550   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3551   }
3552 
3553   // For "renaming" for readibility of the code
3554   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3555                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3556                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3557 
3558   const int elsize = arrays_hashcode_elsize(eltype);
3559 
3560   /*
3561     if (cnt1 >= 2) {
3562       if (cnt1 >= 32) {
3563         UNROLLED VECTOR LOOP
3564       }
3565       UNROLLED SCALAR LOOP
3566     }
3567     SINGLE SCALAR
3568    */
3569 
3570   cmpl(cnt1, 32);
3571   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3572 
3573   // cnt1 >= 32 && generate_vectorized_loop
3574   xorl(index, index);
3575 
3576   // vresult = IntVector.zero(I256);
3577   for (int idx = 0; idx < 4; idx++) {
3578     vpxor(vresult[idx], vresult[idx]);
3579   }
3580   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3581   Register bound = tmp2;
3582   Register next = tmp3;
3583   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3584   movl(next, Address(tmp2, 0));
3585   movdl(vnext, next);
3586   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3587 
3588   // index = 0;
3589   // bound = cnt1 & ~(32 - 1);
3590   movl(bound, cnt1);
3591   andl(bound, ~(32 - 1));
3592   // for (; index < bound; index += 32) {
3593   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3594   // result *= next;
3595   imull(result, next);
  // loop fission to front-load the cost of fetching from memory; OOO
  // execution can then hopefully do a better job of prefetching
3598   for (int idx = 0; idx < 4; idx++) {
3599     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3600   }
3601   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3602   for (int idx = 0; idx < 4; idx++) {
3603     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3604     arrays_hashcode_elvcast(vtmp[idx], eltype);
3605     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3606   }
3607   // index += 32;
3608   addl(index, 32);
3609   // index < bound;
3610   cmpl(index, bound);
3611   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3612   // }
3613 
3614   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3615   subl(cnt1, bound);
3616   // release bound
3617 
3618   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3619   for (int idx = 0; idx < 4; idx++) {
3620     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3621     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3622     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3623   }
3624   // result += vresult.reduceLanes(ADD);
3625   for (int idx = 0; idx < 4; idx++) {
3626     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3627   }
3628 
3629   // } else if (cnt1 < 32) {
3630 
3631   bind(SHORT_UNROLLED_BEGIN);
3632   // int i = 1;
3633   movl(index, 1);
3634   cmpl(index, cnt1);
3635   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3636 
3637   // for (; i < cnt1 ; i += 2) {
3638   bind(SHORT_UNROLLED_LOOP_BEGIN);
3639   movl(tmp3, 961);
3640   imull(result, tmp3);
3641   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3642   movl(tmp3, tmp2);
3643   shll(tmp3, 5);
3644   subl(tmp3, tmp2);
3645   addl(result, tmp3);
3646   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3647   addl(result, tmp3);
3648   addl(index, 2);
3649   cmpl(index, cnt1);
3650   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3651 
3652   // }
3653   // if (i >= cnt1) {
3654   bind(SHORT_UNROLLED_LOOP_EXIT);
3655   jccb(Assembler::greater, END);
3656   movl(tmp2, result);
3657   shll(result, 5);
3658   subl(result, tmp2);
3659   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3660   addl(result, tmp3);
3661   // }
3662   bind(END);
3663 
3664   BLOCK_COMMENT("} // arrays_hashcode");
3665 
3666 } // arrays_hashcode
3667 
3668 // helper function for string_compare
3669 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3670                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3671                                            Address::ScaleFactor scale2, Register index, int ae) {
3672   if (ae == StrIntrinsicNode::LL) {
3673     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3674     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3675   } else if (ae == StrIntrinsicNode::UU) {
3676     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3677     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3678   } else {
3679     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3680     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3681   }
3682 }
3683 
3684 // Compare strings, used for char[] and byte[].
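// Returns a compareTo-style value: the difference between the first
// pair of mismatching elements, or the difference of the lengths when
// the shorter string is a prefix of the longer one; for the UL case
// the sign is flipped at the very end.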
3685 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3686                                        Register cnt1, Register cnt2, Register result,
3687                                        XMMRegister vec1, int ae, KRegister mask) {
3688   ShortBranchVerifier sbv(this);
3689   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
  Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only with AVX3
3691   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3692   int stride2x2 = 0x40;
3693   Address::ScaleFactor scale = Address::no_scale;
3694   Address::ScaleFactor scale1 = Address::no_scale;
3695   Address::ScaleFactor scale2 = Address::no_scale;
3696 
3697   if (ae != StrIntrinsicNode::LL) {
3698     stride2x2 = 0x20;
3699   }
3700 
3701   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3702     shrl(cnt2, 1);
3703   }
3704   // Compute the minimum of the string lengths and the
3705   // difference of the string lengths (stack).
3706   // Do the conditional move stuff
3707   movl(result, cnt1);
3708   subl(cnt1, cnt2);
3709   push(cnt1);
3710   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3711 
3712   // Is the minimum length zero?
3713   testl(cnt2, cnt2);
3714   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3715   if (ae == StrIntrinsicNode::LL) {
3716     // Load first bytes
3717     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3718     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3719   } else if (ae == StrIntrinsicNode::UU) {
3720     // Load first characters
3721     load_unsigned_short(result, Address(str1, 0));
3722     load_unsigned_short(cnt1, Address(str2, 0));
3723   } else {
3724     load_unsigned_byte(result, Address(str1, 0));
3725     load_unsigned_short(cnt1, Address(str2, 0));
3726   }
3727   subl(result, cnt1);
3728   jcc(Assembler::notZero,  POP_LABEL);
3729 
3730   if (ae == StrIntrinsicNode::UU) {
3731     // Divide length by 2 to get number of chars
3732     shrl(cnt2, 1);
3733   }
3734   cmpl(cnt2, 1);
3735   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3736 
3737   // Check if the strings start at the same location and setup scale and stride
3738   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3739     cmpptr(str1, str2);
3740     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3741     if (ae == StrIntrinsicNode::LL) {
3742       scale = Address::times_1;
3743       stride = 16;
3744     } else {
3745       scale = Address::times_2;
3746       stride = 8;
3747     }
3748   } else {
3749     scale1 = Address::times_1;
3750     scale2 = Address::times_2;
3751     // scale not used
3752     stride = 8;
3753   }
3754 
3755   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3756     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3757     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3758     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3759     Label COMPARE_TAIL_LONG;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only with AVX3
3761 
3762     int pcmpmask = 0x19;
3763     if (ae == StrIntrinsicNode::LL) {
3764       pcmpmask &= ~0x01;
3765     }
3766 
    // Set up to compare 16-char (32-byte) vectors,
    // starting from the first character again because it has an aligned address.
3769     if (ae == StrIntrinsicNode::LL) {
3770       stride2 = 32;
3771     } else {
3772       stride2 = 16;
3773     }
3774     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3775       adr_stride = stride << scale;
3776     } else {
3777       adr_stride1 = 8;  //stride << scale1;
3778       adr_stride2 = 16; //stride << scale2;
3779     }
3780 
3781     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as element counters
3783     movl(result, cnt2);
3784     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3785     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3786 
3787     // fast path : compare first 2 8-char vectors.
3788     bind(COMPARE_16_CHARS);
3789     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3790       movdqu(vec1, Address(str1, 0));
3791     } else {
3792       pmovzxbw(vec1, Address(str1, 0));
3793     }
3794     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3795     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3796 
3797     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3798       movdqu(vec1, Address(str1, adr_stride));
3799       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3800     } else {
3801       pmovzxbw(vec1, Address(str1, adr_stride1));
3802       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3803     }
3804     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3805     addl(cnt1, stride);
3806 
3807     // Compare the characters at index in cnt1
3808     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3809     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3810     subl(result, cnt2);
3811     jmp(POP_LABEL);
3812 
3813     // Setup the registers to start vector comparison loop
3814     bind(COMPARE_WIDE_VECTORS);
3815     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3816       lea(str1, Address(str1, result, scale));
3817       lea(str2, Address(str2, result, scale));
3818     } else {
3819       lea(str1, Address(str1, result, scale1));
3820       lea(str2, Address(str2, result, scale2));
3821     }
3822     subl(result, stride2);
3823     subl(cnt2, stride2);
3824     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3825     negptr(result);
3826 
    //  In a loop, compare 16 chars (32 bytes) at once using (vpxor+vptest)
3828     bind(COMPARE_WIDE_VECTORS_LOOP);
3829 
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // try the 64-byte fast loop
3831       cmpl(cnt2, stride2x2);
3832       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3833       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract 0x40
3835 
3836       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3837       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3838         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3839         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3840       } else {
3841         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3842         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3843       }
3844       kortestql(mask, mask);
3845       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3846       addptr(result, stride2x2);  // update since we already compared at this addr
3847       subl(cnt2, stride2x2);      // and sub the size too
3848       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3849 
3850       vpxor(vec1, vec1);
3851       jmpb(COMPARE_WIDE_TAIL);
3852     }//if (VM_Version::supports_avx512vlbw())
3853 
3854     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3855     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3856       vmovdqu(vec1, Address(str1, result, scale));
3857       vpxor(vec1, Address(str2, result, scale));
3858     } else {
3859       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3860       vpxor(vec1, Address(str2, result, scale2));
3861     }
3862     vptest(vec1, vec1);
3863     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3864     addptr(result, stride2);
3865     subl(cnt2, stride2);
3866     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3867     // clean upper bits of YMM registers
3868     vpxor(vec1, vec1);
3869 
3870     // compare wide vectors tail
3871     bind(COMPARE_WIDE_TAIL);
3872     testptr(result, result);
3873     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3874 
3875     movl(result, stride2);
3876     movl(cnt2, result);
3877     negptr(result);
3878     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3879 
    // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3881     bind(VECTOR_NOT_EQUAL);
3882     // clean upper bits of YMM registers
3883     vpxor(vec1, vec1);
3884     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3885       lea(str1, Address(str1, result, scale));
3886       lea(str2, Address(str2, result, scale));
3887     } else {
3888       lea(str1, Address(str1, result, scale1));
3889       lea(str2, Address(str2, result, scale2));
3890     }
3891     jmp(COMPARE_16_CHARS);
3892 
    // Compare tail chars, length between 1 and 15 chars
3894     bind(COMPARE_TAIL_LONG);
3895     movl(cnt2, result);
3896     cmpl(cnt2, stride);
3897     jcc(Assembler::less, COMPARE_SMALL_STR);
3898 
3899     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3900       movdqu(vec1, Address(str1, 0));
3901     } else {
3902       pmovzxbw(vec1, Address(str1, 0));
3903     }
3904     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3905     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3906     subptr(cnt2, stride);
3907     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3908     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3909       lea(str1, Address(str1, result, scale));
3910       lea(str2, Address(str2, result, scale));
3911     } else {
3912       lea(str1, Address(str1, result, scale1));
3913       lea(str2, Address(str2, result, scale2));
3914     }
3915     negptr(cnt2);
3916     jmpb(WHILE_HEAD_LABEL);
3917 
3918     bind(COMPARE_SMALL_STR);
3919   } else if (UseSSE42Intrinsics) {
3920     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3921     int pcmpmask = 0x19;
    // Set up to compare 8-char (16-byte) vectors,
    // starting from the first character again because it has an aligned address.
3924     movl(result, cnt2);
3925     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3926     if (ae == StrIntrinsicNode::LL) {
3927       pcmpmask &= ~0x01;
3928     }
3929     jcc(Assembler::zero, COMPARE_TAIL);
3930     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3931       lea(str1, Address(str1, result, scale));
3932       lea(str2, Address(str2, result, scale));
3933     } else {
3934       lea(str1, Address(str1, result, scale1));
3935       lea(str2, Address(str2, result, scale2));
3936     }
3937     negptr(result);
3938 
3939     // pcmpestri
3940     //   inputs:
3941     //     vec1- substring
3942     //     rax - negative string length (elements count)
3943     //     mem - scanned string
3944     //     rdx - string length (elements count)
3945     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3946     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3947     //   outputs:
3948     //     rcx - first mismatched element index
3949     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3950 
3951     bind(COMPARE_WIDE_VECTORS);
3952     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3953       movdqu(vec1, Address(str1, result, scale));
3954       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3955     } else {
3956       pmovzxbw(vec1, Address(str1, result, scale1));
3957       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3958     }
3959     // After pcmpestri cnt1(rcx) contains mismatched element index
3960 
3961     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3962     addptr(result, stride);
3963     subptr(cnt2, stride);
3964     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3965 
3966     // compare wide vectors tail
3967     testptr(result, result);
3968     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3969 
3970     movl(cnt2, stride);
3971     movl(result, stride);
3972     negptr(result);
3973     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3974       movdqu(vec1, Address(str1, result, scale));
3975       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3976     } else {
3977       pmovzxbw(vec1, Address(str1, result, scale1));
3978       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3979     }
3980     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3981 
3982     // Mismatched characters in the vectors
3983     bind(VECTOR_NOT_EQUAL);
3984     addptr(cnt1, result);
3985     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3986     subl(result, cnt2);
3987     jmpb(POP_LABEL);
3988 
3989     bind(COMPARE_TAIL); // limit is zero
3990     movl(cnt2, result);
3991     // Fallthru to tail compare
3992   }
3993   // Shift str2 and str1 to the end of the arrays, negate min
3994   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3995     lea(str1, Address(str1, cnt2, scale));
3996     lea(str2, Address(str2, cnt2, scale));
3997   } else {
3998     lea(str1, Address(str1, cnt2, scale1));
3999     lea(str2, Address(str2, cnt2, scale2));
4000   }
4001   decrementl(cnt2);  // first character was compared already
4002   negptr(cnt2);
4003 
4004   // Compare the rest of the elements
4005   bind(WHILE_HEAD_LABEL);
4006   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
4007   subl(result, cnt1);
4008   jccb(Assembler::notZero, POP_LABEL);
4009   increment(cnt2);
4010   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
4011 
4012   // Strings are equal up to min length.  Return the length difference.
4013   bind(LENGTH_DIFF_LABEL);
4014   pop(result);
4015   if (ae == StrIntrinsicNode::UU) {
4016     // Divide diff by 2 to get number of chars
4017     sarl(result, 1);
4018   }
4019   jmpb(DONE_LABEL);
4020 
4021   if (VM_Version::supports_avx512vlbw()) {
4022 
4023     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
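    // mask has a 1 bit per matching byte pair, so invert it and use
    // bsf to locate the first mismatch within the 64-byte block.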
4024 
4025     kmovql(cnt1, mask);
4026     notq(cnt1);
4027     bsfq(cnt2, cnt1);
4028     if (ae != StrIntrinsicNode::LL) {
4029       // Divide diff by 2 to get number of chars
4030       sarl(cnt2, 1);
4031     }
4032     addq(result, cnt2);
4033     if (ae == StrIntrinsicNode::LL) {
4034       load_unsigned_byte(cnt1, Address(str2, result));
4035       load_unsigned_byte(result, Address(str1, result));
4036     } else if (ae == StrIntrinsicNode::UU) {
4037       load_unsigned_short(cnt1, Address(str2, result, scale));
4038       load_unsigned_short(result, Address(str1, result, scale));
4039     } else {
4040       load_unsigned_short(cnt1, Address(str2, result, scale2));
4041       load_unsigned_byte(result, Address(str1, result, scale1));
4042     }
4043     subl(result, cnt1);
4044     jmpb(POP_LABEL);
4045   }//if (VM_Version::supports_avx512vlbw())
4046 
4047   // Discard the stored length difference
4048   bind(POP_LABEL);
4049   pop(cnt1);
4050 
4051   // That's it
4052   bind(DONE_LABEL);
  if (ae == StrIntrinsicNode::UL) {
4054     negl(result);
4055   }
4056 
4057 }
4058 
// Search for a non-ASCII character (negative byte value) in a byte array;
// return the index of the first such character, otherwise the length
// of the array segment searched.
4062 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4063 //   @IntrinsicCandidate
4064 //   public static int countPositives(byte[] ba, int off, int len) {
4065 //     for (int i = off; i < off + len; i++) {
4066 //       if (ba[i] < 0) {
4067 //         return i - off;
4068 //       }
4069 //     }
4070 //     return len;
4071 //   }
4072 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4073   Register result, Register tmp1,
4074   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4075   // rsi: byte array
4076   // rcx: len
4077   // rax: result
4078   ShortBranchVerifier sbv(this);
4079   assert_different_registers(ary1, len, result, tmp1);
4080   assert_different_registers(vec1, vec2);
4081   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4082 
4083   movl(result, len); // copy
4084   // len == 0
4085   testl(len, len);
4086   jcc(Assembler::zero, DONE);
4087 
4088   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4089     VM_Version::supports_avx512vlbw() &&
4090     VM_Version::supports_bmi2()) {
4091 
4092     Label test_64_loop, test_tail, BREAK_LOOP;
4093     movl(tmp1, len);
4094     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4095 
4096     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4097     andl(len,  0xffffffc0); // vector count (in chars)
4098     jccb(Assembler::zero, test_tail);
4099 
4100     lea(ary1, Address(ary1, len, Address::times_1));
4101     negptr(len);
4102 
4103     bind(test_64_loop);
4104     // Check whether our 64 elements of size byte contain negatives
4105     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4106     kortestql(mask1, mask1);
4107     jcc(Assembler::notZero, BREAK_LOOP);
4108 
4109     addptr(len, 64);
4110     jccb(Assembler::notZero, test_64_loop);
4111 
4112     bind(test_tail);
4113     // bail out when there is nothing to be done
4114     testl(tmp1, -1);
4115     jcc(Assembler::zero, DONE);
4116 
4117 
    // check the tail for absence of negatives
    // build the tail mask ~(~0 << len) == (1 << len) - 1
4120     {
4121       Register tmp3_aliased = len;
4122       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4123       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4124       notq(tmp3_aliased);
4125       kmovql(mask2, tmp3_aliased);
4126     }
4127 
4128     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4129     ktestq(mask1, mask2);
4130     jcc(Assembler::zero, DONE);
4131 
    // do a full check for negative bytes in the tail
4133     movl(len, tmp1); // tmp1 holds low 6-bit from original len;
4134                      // ary1 already pointing to the right place
4135     jmpb(TAIL_START);
4136 
4137     bind(BREAK_LOOP);
4138     // At least one byte in the last 64 byte block was negative.
4139     // Set up to look at the last 64 bytes as if they were a tail
4140     lea(ary1, Address(ary1, len, Address::times_1));
4141     addptr(result, len);
4142     // Ignore the very last byte: if all others are positive,
4143     // it must be negative, so we can skip right to the 2+1 byte
4144     // end comparison at this point
4145     orl(result, 63);
4146     movl(len, 63);
4147     // Fallthru to tail compare
4148   } else {
4149 
4150     if (UseAVX >= 2) {
4151       // With AVX2, use 32-byte vector compare
4152       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4153 
4154       // Compare 32-byte vectors
4155       testl(len, 0xffffffe0);   // vector count (in bytes)
4156       jccb(Assembler::zero, TAIL_START);
4157 
4158       andl(len, 0xffffffe0);
4159       lea(ary1, Address(ary1, len, Address::times_1));
4160       negptr(len);
4161 
4162       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
4163       movdl(vec2, tmp1);
4164       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4165 
4166       bind(COMPARE_WIDE_VECTORS);
4167       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4168       vptest(vec1, vec2);
4169       jccb(Assembler::notZero, BREAK_LOOP);
4170       addptr(len, 32);
4171       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4172 
4173       testl(result, 0x0000001f);   // any bytes remaining?
4174       jcc(Assembler::zero, DONE);
4175 
4176       // Quick test using the already prepared vector mask
4177       movl(len, result);
4178       andl(len, 0x0000001f);
4179       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4180       vptest(vec1, vec2);
4181       jcc(Assembler::zero, DONE);
4182       // There are zeros, jump to the tail to determine exactly where
4183       jmpb(TAIL_START);
4184 
4185       bind(BREAK_LOOP);
4186       // At least one byte in the last 32-byte vector is negative.
4187       // Set up to look at the last 32 bytes as if they were a tail
4188       lea(ary1, Address(ary1, len, Address::times_1));
4189       addptr(result, len);
4190       // Ignore the very last byte: if all others are positive,
4191       // it must be negative, so we can skip right to the 2+1 byte
4192       // end comparison at this point
4193       orl(result, 31);
4194       movl(len, 31);
4195       // Fallthru to tail compare
4196     } else if (UseSSE42Intrinsics) {
4197       // With SSE4.2, use double quad vector compare
4198       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4199 
4200       // Compare 16-byte vectors
4201       testl(len, 0xfffffff0);   // vector count (in bytes)
4202       jcc(Assembler::zero, TAIL_START);
4203 
4204       andl(len, 0xfffffff0);
4205       lea(ary1, Address(ary1, len, Address::times_1));
4206       negptr(len);
4207 
4208       movl(tmp1, 0x80808080);
4209       movdl(vec2, tmp1);
4210       pshufd(vec2, vec2, 0);
4211 
4212       bind(COMPARE_WIDE_VECTORS);
4213       movdqu(vec1, Address(ary1, len, Address::times_1));
4214       ptest(vec1, vec2);
4215       jccb(Assembler::notZero, BREAK_LOOP);
4216       addptr(len, 16);
4217       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4218 
4219       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4220       jcc(Assembler::zero, DONE);
4221 
4222       // Quick test using the already prepared vector mask
4223       movl(len, result);
4224       andl(len, 0x0000000f);   // tail count (in bytes)
4225       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4226       ptest(vec1, vec2);
4227       jcc(Assembler::zero, DONE);
4228       jmpb(TAIL_START);
4229 
4230       bind(BREAK_LOOP);
4231       // At least one byte in the last 16-byte vector is negative.
4232       // Set up and look at the last 16 bytes as if they were a tail
4233       lea(ary1, Address(ary1, len, Address::times_1));
4234       addptr(result, len);
4235       // Ignore the very last byte: if all others are positive,
4236       // it must be negative, so we can skip right to the 2+1 byte
4237       // end comparison at this point
4238       orl(result, 15);
4239       movl(len, 15);
4240       // Fallthru to tail compare
4241     }
4242   }
4243 
4244   bind(TAIL_START);
4245   // Compare 4-byte vectors
4246   andl(len, 0xfffffffc); // vector count (in bytes)
4247   jccb(Assembler::zero, COMPARE_CHAR);
4248 
4249   lea(ary1, Address(ary1, len, Address::times_1));
4250   negptr(len);
4251 
4252   bind(COMPARE_VECTORS);
4253   movl(tmp1, Address(ary1, len, Address::times_1));
4254   andl(tmp1, 0x80808080);
4255   jccb(Assembler::notZero, TAIL_ADJUST);
4256   addptr(len, 4);
4257   jccb(Assembler::notZero, COMPARE_VECTORS);
4258 
4259   // Compare trailing char (final 2-3 bytes), if any
4260   bind(COMPARE_CHAR);
4261 
4262   testl(result, 0x2);   // tail  char
4263   jccb(Assembler::zero, COMPARE_BYTE);
4264   load_unsigned_short(tmp1, Address(ary1, 0));
4265   andl(tmp1, 0x00008080);
4266   jccb(Assembler::notZero, CHAR_ADJUST);
4267   lea(ary1, Address(ary1, 2));
4268 
4269   bind(COMPARE_BYTE);
4270   testl(result, 0x1);   // tail  byte
4271   jccb(Assembler::zero, DONE);
4272   load_unsigned_byte(tmp1, Address(ary1, 0));
4273   testl(tmp1, 0x00000080);
4274   jccb(Assembler::zero, DONE);
4275   subptr(result, 1);
4276   jmpb(DONE);
4277 
4278   bind(TAIL_ADJUST);
4279   // there are negative bits in the last 4 byte block.
4280   // Adjust result and check the next three bytes
4281   addptr(result, len);
4282   orl(result, 3);
4283   lea(ary1, Address(ary1, len, Address::times_1));
4284   jmpb(COMPARE_CHAR);
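  // Worked through: len is the negative offset of the failing 4-byte chunk,
  // so result + len lands on that chunk (plus the sub-4-byte remainder) and
  // or-ing with 3 leaves result at the chunk's last byte index; the
  // COMPARE_CHAR/COMPARE_BYTE steps above then refine result to the exact
  // index of the first negative byte.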
4285 
4286   bind(CHAR_ADJUST);
4287   // We are looking at a char + optional byte tail, and found that one
4288   // of the bytes in the char is negative. Adjust the result, check the
4289   // first byte and readjust if needed.
4290   andl(result, 0xfffffffc);
4291   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4292   jccb(Assembler::notZero, DONE);
4293   addptr(result, 1);
4294 
4295   // That's it
4296   bind(DONE);
4297   if (UseAVX >= 2) {
4298     // clean upper bits of YMM registers
4299     vpxor(vec1, vec1);
4300     vpxor(vec2, vec2);
4301   }
4302 }
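
// A scalar sketch of the result the routine above produces (an illustrative,
// hypothetical helper, not anything defined in this file): the count of
// leading non-negative bytes, i.e. the index of the first byte with the
// sign bit set, or len when no byte is negative.
//
//   static int count_leading_nonnegative(const jbyte* a, int len) {
//     for (int i = 0; i < len; i++) {
//       if (a[i] < 0) return i;
//     }
//     return len;
//   }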
4303 
4304 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4305 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4306                                       Register limit, Register result, Register chr,
4307                                       XMMRegister vec1, XMMRegister vec2, bool is_char,
4308                                       KRegister mask, bool expand_ary2) {
4309   // for expand_ary2, limit is the (smaller) size of the second array.
4310   ShortBranchVerifier sbv(this);
4311   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4312 
4313   assert(!expand_ary2 || UseAVX == 2,
4314          "Expansion only implemented for AVX2");
4315 
4316   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4317   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4318 
4319   Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4320   int scaleIncr = expand_ary2 ? 8 : 16;
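  // With expand_ary2, ary1 holds 2-byte elements and ary2 1-byte elements:
  // limit counts ary2 bytes, ary1 is addressed at limit * 2 (scaleFactor),
  // and each ary2 load is zero-extended to shorts (vpmovzxbw) before
  // comparing, so the loop strides are halved in terms of limit (scaleIncr).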
4321 
4322   if (is_array_equ) {
4323     // Check the input args
4324     cmpoop(ary1, ary2);
4325     jcc(Assembler::equal, TRUE_LABEL);
4326 
4327     // Need additional checks for arrays_equals.
4328     testptr(ary1, ary1);
4329     jcc(Assembler::zero, FALSE_LABEL);
4330     testptr(ary2, ary2);
4331     jcc(Assembler::zero, FALSE_LABEL);
4332 
4333     // Check the lengths
4334     movl(limit, Address(ary1, length_offset));
4335     cmpl(limit, Address(ary2, length_offset));
4336     jcc(Assembler::notEqual, FALSE_LABEL);
4337   }
4338 
4339   // count == 0
4340   testl(limit, limit);
4341   jcc(Assembler::zero, TRUE_LABEL);
4342 
4343   if (is_array_equ) {
4344     // Load array address
4345     lea(ary1, Address(ary1, base_offset));
4346     lea(ary2, Address(ary2, base_offset));
4347   }
4348 
4349   if (is_array_equ && is_char) {
4350     // arrays_equals when used for char[].
4351     shll(limit, 1);      // byte count != 0
4352   }
4353   movl(result, limit); // copy
4354 
4355   if (UseAVX >= 2) {
4356     // With AVX2, use 32-byte vector compare
4357     Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4358 
4359     // Compare 32-byte vectors
4360     if (expand_ary2) {
4361       andl(result, 0x0000000f);  //   tail count (in bytes)
4362       andl(limit, 0xfffffff0);   // vector count (in bytes)
4363       jcc(Assembler::zero, COMPARE_TAIL);
4364     } else {
4365       andl(result, 0x0000001f);  //   tail count (in bytes)
4366       andl(limit, 0xffffffe0);   // vector count (in bytes)
4367       jcc(Assembler::zero, COMPARE_TAIL_16);
4368     }
4369 
4370     lea(ary1, Address(ary1, limit, scaleFactor));
4371     lea(ary2, Address(ary2, limit, Address::times_1));
4372     negptr(limit);
4373 
4374     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4375       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4376 
4377       cmpl(limit, -64);
4378       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4379 
4380       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4381 
4382       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4383       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4384       kortestql(mask, mask);
4385       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
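      // kortestql sets CF only when the OR of the masks is all ones, i.e.
      // when all 64 byte lanes compared equal, so aboveEqual (CF == 0)
      // catches any lane that differed.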
4386       addptr(limit, 64);  // update since we already compared at this addr
4387       cmpl(limit, -64);
4388       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4389 
4390       // At this point we may still need to compare -limit+result bytes.
4391       // We could execute the next two instructions and just continue via the non-wide path:
4392       //  cmpl(limit, 0);
4393       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4394       // But since we stopped at the points ary{1,2}+limit which are
4395       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4396       // (|limit| <= 32 and result < 32),
4397       // we may just compare the last 64 bytes.
4398       //
4399       addptr(result, -64);   // this is safe because we just came from this area
4400       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4401       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4402       kortestql(mask, mask);
4403       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4404 
4405       jmp(TRUE_LABEL);
4406 
4407       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4408 
4409     } // if (AVX3Threshold == 0 && VM_Version::supports_avx512vlbw())
4410 
4411     bind(COMPARE_WIDE_VECTORS);
4412     vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4413     if (expand_ary2) {
4414       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4415     } else {
4416       vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4417     }
4418     vpxor(vec1, vec2);
4419 
4420     vptest(vec1, vec1);
4421     jcc(Assembler::notZero, FALSE_LABEL);
4422     addptr(limit, scaleIncr * 2);
4423     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4424 
4425     testl(result, result);
4426     jcc(Assembler::zero, TRUE_LABEL);
4427 
4428     vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4429     if (expand_ary2) {
4430       vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4431     } else {
4432       vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4433     }
4434     vpxor(vec1, vec2);
4435 
4436     vptest(vec1, vec1);
4437     jcc(Assembler::notZero, FALSE_LABEL);
4438     jmp(TRUE_LABEL);
4439 
4440     bind(COMPARE_TAIL_16); // limit is zero
4441     movl(limit, result);
4442 
4443     // Compare 16-byte chunks
4444     andl(result, 0x0000000f);  //   tail count (in bytes)
4445     andl(limit, 0xfffffff0);   // vector count (in bytes)
4446     jcc(Assembler::zero, COMPARE_TAIL);
4447 
4448     lea(ary1, Address(ary1, limit, scaleFactor));
4449     lea(ary2, Address(ary2, limit, Address::times_1));
4450     negptr(limit);
4451 
4452     bind(COMPARE_WIDE_VECTORS_16);
4453     movdqu(vec1, Address(ary1, limit, scaleFactor));
4454     if (expand_ary2) {
4455       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4456     } else {
4457       movdqu(vec2, Address(ary2, limit, Address::times_1));
4458     }
4459     pxor(vec1, vec2);
4460 
4461     ptest(vec1, vec1);
4462     jcc(Assembler::notZero, FALSE_LABEL);
4463     addptr(limit, scaleIncr);
4464     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4465 
4466     bind(COMPARE_TAIL); // limit is zero
4467     movl(limit, result);
4468     // Fallthru to tail compare
4469   } else if (UseSSE42Intrinsics) {
4470     // With SSE4.2, use double quad vector compare
4471     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4472 
4473     // Compare 16-byte vectors
4474     andl(result, 0x0000000f);  //   tail count (in bytes)
4475     andl(limit, 0xfffffff0);   // vector count (in bytes)
4476     jcc(Assembler::zero, COMPARE_TAIL);
4477 
4478     lea(ary1, Address(ary1, limit, Address::times_1));
4479     lea(ary2, Address(ary2, limit, Address::times_1));
4480     negptr(limit);
4481 
4482     bind(COMPARE_WIDE_VECTORS);
4483     movdqu(vec1, Address(ary1, limit, Address::times_1));
4484     movdqu(vec2, Address(ary2, limit, Address::times_1));
4485     pxor(vec1, vec2);
4486 
4487     ptest(vec1, vec1);
4488     jcc(Assembler::notZero, FALSE_LABEL);
4489     addptr(limit, 16);
4490     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4491 
4492     testl(result, result);
4493     jcc(Assembler::zero, TRUE_LABEL);
4494 
4495     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4496     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4497     pxor(vec1, vec2);
4498 
4499     ptest(vec1, vec1);
4500     jccb(Assembler::notZero, FALSE_LABEL);
4501     jmpb(TRUE_LABEL);
4502 
4503     bind(COMPARE_TAIL); // limit is zero
4504     movl(limit, result);
4505     // Fallthru to tail compare
4506   }
4507 
4508   // Compare 4-byte vectors
4509   if (expand_ary2) {
4510     testl(result, result);
4511     jccb(Assembler::zero, TRUE_LABEL);
4512   } else {
4513     andl(limit, 0xfffffffc); // vector count (in bytes)
4514     jccb(Assembler::zero, COMPARE_CHAR);
4515   }
4516 
4517   lea(ary1, Address(ary1, limit, scaleFactor));
4518   lea(ary2, Address(ary2, limit, Address::times_1));
4519   negptr(limit);
4520 
4521   bind(COMPARE_VECTORS);
4522   if (expand_ary2) {
4523     // There are no "vector" operations for bytes to shorts
4524     movzbl(chr, Address(ary2, limit, Address::times_1));
4525     cmpw(Address(ary1, limit, Address::times_2), chr);
4526     jccb(Assembler::notEqual, FALSE_LABEL);
4527     addptr(limit, 1);
4528     jcc(Assembler::notZero, COMPARE_VECTORS);
4529     jmp(TRUE_LABEL);
4530   } else {
4531     movl(chr, Address(ary1, limit, Address::times_1));
4532     cmpl(chr, Address(ary2, limit, Address::times_1));
4533     jccb(Assembler::notEqual, FALSE_LABEL);
4534     addptr(limit, 4);
4535     jcc(Assembler::notZero, COMPARE_VECTORS);
4536   }
4537 
4538   // Compare trailing char (final 2 bytes), if any
4539   bind(COMPARE_CHAR);
4540   testl(result, 0x2);   // tail  char
4541   jccb(Assembler::zero, COMPARE_BYTE);
4542   load_unsigned_short(chr, Address(ary1, 0));
4543   load_unsigned_short(limit, Address(ary2, 0));
4544   cmpl(chr, limit);
4545   jccb(Assembler::notEqual, FALSE_LABEL);
4546 
4547   if (is_array_equ && is_char) {
4548     bind(COMPARE_BYTE);
4549   } else {
4550     lea(ary1, Address(ary1, 2));
4551     lea(ary2, Address(ary2, 2));
4552 
4553     bind(COMPARE_BYTE);
4554     testl(result, 0x1);   // tail  byte
4555     jccb(Assembler::zero, TRUE_LABEL);
4556     load_unsigned_byte(chr, Address(ary1, 0));
4557     load_unsigned_byte(limit, Address(ary2, 0));
4558     cmpl(chr, limit);
4559     jccb(Assembler::notEqual, FALSE_LABEL);
4560   }
4561   bind(TRUE_LABEL);
4562   movl(result, 1);   // return true
4563   jmpb(DONE);
4564 
4565   bind(FALSE_LABEL);
4566   xorl(result, result); // return false
4567 
4568   // That's it
4569   bind(DONE);
4570   if (UseAVX >= 2) {
4571     // clean upper bits of YMM registers
4572     vpxor(vec1, vec1);
4573     vpxor(vec2, vec2);
4574   }
4575 }
4576 
4577 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4578 #define __ masm.
4579   Register dst = stub.data<0>();
4580   XMMRegister src = stub.data<1>();
4581   address target = stub.data<2>();
4582   __ bind(stub.entry());
4583   __ subptr(rsp, 8);
4584   __ movdbl(Address(rsp), src);
4585   __ call(RuntimeAddress(target));
4586   // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
4587   __ pop(dst);
4588   __ jmp(stub.continuation());
4589 #undef __
4590 }
4591 
4592 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4593   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4594   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4595 
4596   address slowpath_target;
4597   if (dst_bt == T_INT) {
4598     if (src_bt == T_FLOAT) {
4599       cvttss2sil(dst, src);
4600       cmpl(dst, 0x80000000);
4601       slowpath_target = StubRoutines::x86::f2i_fixup();
4602     } else {
4603       cvttsd2sil(dst, src);
4604       cmpl(dst, 0x80000000);
4605       slowpath_target = StubRoutines::x86::d2i_fixup();
4606     }
4607   } else {
4608     if (src_bt == T_FLOAT) {
4609       cvttss2siq(dst, src);
4610       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4611       slowpath_target = StubRoutines::x86::f2l_fixup();
4612     } else {
4613       cvttsd2siq(dst, src);
4614       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4615       slowpath_target = StubRoutines::x86::d2l_fixup();
4616     }
4617   }
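  // cvttss2si/cvttsd2si write the "integer indefinite" value (0x80000000,
  // or 0x8000000000000000 for 64-bit results) for NaN and out-of-range
  // inputs, so the equality check below routes exactly those inputs to the
  // fixup stub.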
4618 
4619   // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
4620   int max_size = 23 + (UseAPX ? 1 : 0);
4621   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
4622   jcc(Assembler::equal, stub->entry());
4623   bind(stub->continuation());
4624 }
4625 
4626 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4627                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
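  // EVEX write-mask semantics: with merge == true, dst lanes whose mask bit
  // is 0 keep their previous contents (merge-masking); with merge == false
  // they are zeroed (zero-masking).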
4628   switch(ideal_opc) {
4629     case Op_LShiftVS:
4630       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4631     case Op_LShiftVI:
4632       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4633     case Op_LShiftVL:
4634       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4635     case Op_RShiftVS:
4636       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4637     case Op_RShiftVI:
4638       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4639     case Op_RShiftVL:
4640       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4641     case Op_URShiftVS:
4642       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4643     case Op_URShiftVI:
4644       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4645     case Op_URShiftVL:
4646       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4647     case Op_RotateRightV:
4648       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4649     case Op_RotateLeftV:
4650       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4651     default:
4652       fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
4653       break;
4654   }
4655 }
4656 
4657 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4658                                                XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4659   if (is_unsigned) {
4660     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4661   } else {
4662     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4663   }
4664 }
4665 
4666 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4667                                                       XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4668   switch (elem_bt) {
4669     case T_BYTE:
4670       if (ideal_opc == Op_SaturatingAddV) {
4671         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4672       } else {
4673         assert(ideal_opc == Op_SaturatingSubV, "");
4674         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4675       }
4676       break;
4677     case T_SHORT:
4678       if (ideal_opc == Op_SaturatingAddV) {
4679         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4680       } else {
4681         assert(ideal_opc == Op_SaturatingSubV, "");
4682         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4683       }
4684       break;
4685     default:
4686       fatal("Unsupported type %s", type2name(elem_bt));
4687       break;
4688   }
4689 }
4690 
4691 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4692                                                         XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4693   switch (elem_bt) {
4694     case T_BYTE:
4695       if (ideal_opc == Op_SaturatingAddV) {
4696         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4697       } else {
4698         assert(ideal_opc == Op_SaturatingSubV, "");
4699         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4700       }
4701       break;
4702     case T_SHORT:
4703       if (ideal_opc == Op_SaturatingAddV) {
4704         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4705       } else {
4706         assert(ideal_opc == Op_SaturatingSubV, "");
4707         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4708       }
4709       break;
4710     default:
4711       fatal("Unsupported type %s", type2name(elem_bt));
4712       break;
4713   }
4714 }
4715 
4716 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4717                                                Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4718   if (is_unsigned) {
4719     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4720   } else {
4721     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4722   }
4723 }
4724 
4725 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4726                                                       XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4727   switch (elem_bt) {
4728     case T_BYTE:
4729       if (ideal_opc == Op_SaturatingAddV) {
4730         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4731       } else {
4732         assert(ideal_opc == Op_SaturatingSubV, "");
4733         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4734       }
4735       break;
4736     case T_SHORT:
4737       if (ideal_opc == Op_SaturatingAddV) {
4738         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4739       } else {
4740         assert(ideal_opc == Op_SaturatingSubV, "");
4741         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4742       }
4743       break;
4744     default:
4745       fatal("Unsupported type %s", type2name(elem_bt));
4746       break;
4747   }
4748 }
4749 
4750 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4751                                                         XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4752   switch (elem_bt) {
4753     case T_BYTE:
4754       if (ideal_opc == Op_SaturatingAddV) {
4755         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4756       } else {
4757         assert(ideal_opc == Op_SaturatingSubV, "");
4758         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4759       }
4760       break;
4761     case T_SHORT:
4762       if (ideal_opc == Op_SaturatingAddV) {
4763         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4764       } else {
4765         assert(ideal_opc == Op_SaturatingSubV, "");
4766         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4767       }
4768       break;
4769     default:
4770       fatal("Unsupported type %s", type2name(elem_bt));
4771       break;
4772   }
4773 }
4774 
4775 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4776                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4777                                     bool is_varshift) {
4778   switch (ideal_opc) {
4779     case Op_AddVB:
4780       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4781     case Op_AddVS:
4782       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4783     case Op_AddVI:
4784       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4785     case Op_AddVL:
4786       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4787     case Op_AddVF:
4788       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4789     case Op_AddVD:
4790       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4791     case Op_SubVB:
4792       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4793     case Op_SubVS:
4794       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4795     case Op_SubVI:
4796       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4797     case Op_SubVL:
4798       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4799     case Op_SubVF:
4800       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4801     case Op_SubVD:
4802       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4803     case Op_MulVS:
4804       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4805     case Op_MulVI:
4806       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4807     case Op_MulVL:
4808       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4809     case Op_MulVF:
4810       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4811     case Op_MulVD:
4812       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4813     case Op_DivVF:
4814       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4815     case Op_DivVD:
4816       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4817     case Op_SqrtVF:
4818       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4819     case Op_SqrtVD:
4820       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4821     case Op_AbsVB:
4822       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4823     case Op_AbsVS:
4824       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4825     case Op_AbsVI:
4826       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4827     case Op_AbsVL:
4828       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4829     case Op_FmaVF:
4830       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4831     case Op_FmaVD:
4832       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4833     case Op_VectorRearrange:
4834       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4835     case Op_LShiftVS:
4836       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4837     case Op_LShiftVI:
4838       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4839     case Op_LShiftVL:
4840       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4841     case Op_RShiftVS:
4842       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4843     case Op_RShiftVI:
4844       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4845     case Op_RShiftVL:
4846       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4847     case Op_URShiftVS:
4848       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4849     case Op_URShiftVI:
4850       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4851     case Op_URShiftVL:
4852       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4853     case Op_RotateLeftV:
4854       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4855     case Op_RotateRightV:
4856       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4857     case Op_MaxV:
4858       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4859     case Op_MinV:
4860       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4861     case Op_UMinV:
4862       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4863     case Op_UMaxV:
4864       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4865     case Op_XorV:
4866       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4867     case Op_OrV:
4868       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4869     case Op_AndV:
4870       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4871     default:
4872       fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
4873       break;
4874   }
4875 }
4876 
4877 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4878                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4879   switch (ideal_opc) {
4880     case Op_AddVB:
4881       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4882     case Op_AddVS:
4883       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4884     case Op_AddVI:
4885       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4886     case Op_AddVL:
4887       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4888     case Op_AddVF:
4889       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4890     case Op_AddVD:
4891       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4892     case Op_SubVB:
4893       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4894     case Op_SubVS:
4895       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4896     case Op_SubVI:
4897       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4898     case Op_SubVL:
4899       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4900     case Op_SubVF:
4901       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4902     case Op_SubVD:
4903       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4904     case Op_MulVS:
4905       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4906     case Op_MulVI:
4907       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4908     case Op_MulVL:
4909       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4910     case Op_MulVF:
4911       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4912     case Op_MulVD:
4913       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4914     case Op_DivVF:
4915       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4916     case Op_DivVD:
4917       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4918     case Op_FmaVF:
4919       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4920     case Op_FmaVD:
4921       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4922     case Op_MaxV:
4923       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4924     case Op_MinV:
4925       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4926     case Op_UMaxV:
4927       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4928     case Op_UMinV:
4929       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4930     case Op_XorV:
4931       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4932     case Op_OrV:
4933       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4934     case Op_AndV:
4935       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4936     default:
4937       fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
4938       break;
4939   }
4940 }
4941 
4942 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4943                                   KRegister src1, KRegister src2) {
4944   BasicType etype = T_ILLEGAL;
4945   switch(mask_len) {
4946     case 2:
4947     case 4:
4948     case 8:  etype = T_BYTE; break;
4949     case 16: etype = T_SHORT; break;
4950     case 32: etype = T_INT; break;
4951     case 64: etype = T_LONG; break;
4952     default: fatal("Unsupported type"); break;
4953   }
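  // The element type only selects the width of the k-register instruction
  // (byte/word/dword/qword forms for 8/16/32/64 mask bits); mask lengths
  // below 8 still use the byte-wide form.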
4954   assert(etype != T_ILLEGAL, "");
4955   switch(ideal_opc) {
4956     case Op_AndVMask:
4957       kand(etype, dst, src1, src2); break;
4958     case Op_OrVMask:
4959       kor(etype, dst, src1, src2); break;
4960     case Op_XorVMask:
4961       kxor(etype, dst, src1, src2); break;
4962     default:
4963       fatal("Unsupported masked operation"); break;
4964   }
4965 }
4966 
4967 /*
4968  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4969  * If src is NaN, the result is 0.
4970  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4971  * the result is equal to the value of Integer.MIN_VALUE.
4972  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4973  * the result is equal to the value of Integer.MAX_VALUE.
4974  */
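/*
 * A scalar model of these semantics (illustrative sketch only):
 *
 *   jint f2i(jfloat f) {
 *     if (f != f)                return 0;         // NaN
 *     if (f <= (jfloat)min_jint) return min_jint;
 *     if (f >= (jfloat)max_jint) return max_jint;
 *     return (jint)f;
 *   }
 */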
4975 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4976                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4977                                                                    Register rscratch, AddressLiteral float_sign_flip,
4978                                                                    int vec_enc) {
4979   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4980   Label done;
4981   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4982   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4983   vptest(xtmp2, xtmp2, vec_enc);
4984   jccb(Assembler::equal, done);
4985 
4986   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4987   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4988 
4989   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4990   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4991   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4992 
4993   // Recompute the mask for the remaining special values.
4994   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4995   // Extract SRC values corresponding to TRUE mask lanes.
4996   vpand(xtmp4, xtmp2, src, vec_enc);
4997   // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
4998   // values are set.
4999   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
5000 
5001   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
5002   bind(done);
5003 }
5004 
5005 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5006                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5007                                                                     Register rscratch, AddressLiteral float_sign_flip,
5008                                                                     int vec_enc) {
5009   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5010   Label done;
5011   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5012   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5013   kortestwl(ktmp1, ktmp1);
5014   jccb(Assembler::equal, done);
5015 
5016   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5017   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5018   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5019 
5020   kxorwl(ktmp1, ktmp1, ktmp2);
5021   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
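  // vpternlogd with function 0x11 and both source operands equal computes
  // dst = ~src, turning the sign-flip pattern into the max-value pattern.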
5022   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5023   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5024   bind(done);
5025 }
5026 
5027 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5028                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5029                                                                      Register rscratch, AddressLiteral double_sign_flip,
5030                                                                      int vec_enc) {
5031   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5032 
5033   Label done;
5034   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5035   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
5036   kortestwl(ktmp1, ktmp1);
5037   jccb(Assembler::equal, done);
5038 
5039   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5040   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5041   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5042 
5043   kxorwl(ktmp1, ktmp1, ktmp2);
5044   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5045   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5046   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5047   bind(done);
5048 }
5049 
5050 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5051                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5052                                                                      Register rscratch, AddressLiteral float_sign_flip,
5053                                                                      int vec_enc) {
5054   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5055   Label done;
5056   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5057   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5058   kortestwl(ktmp1, ktmp1);
5059   jccb(Assembler::equal, done);
5060 
5061   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5062   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5063   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5064 
5065   kxorwl(ktmp1, ktmp1, ktmp2);
5066   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5067   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5068   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5069   bind(done);
5070 }
5071 
5072 /*
5073  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
5074  * If src is NaN, the result is 0.
5075  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5076  * the result is equal to the value of Long.MIN_VALUE.
5077  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5078  * the result is equal to the value of Long.MAX_VALUE.
5079  */
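/*
 * (The scalar model is the same as the F2I sketch above, with Long.MIN_VALUE
 * and Long.MAX_VALUE as the saturation bounds.)
 */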
5080 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5081                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5082                                                                       Register rscratch, AddressLiteral double_sign_flip,
5083                                                                       int vec_enc) {
5084   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5085 
5086   Label done;
5087   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5088   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
5089   kortestwl(ktmp1, ktmp1);
5090   jccb(Assembler::equal, done);
5091 
5092   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5093   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5094   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5095 
5096   kxorwl(ktmp1, ktmp1, ktmp2);
5097   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5098   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5099   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5100   bind(done);
5101 }
5102 
5103 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5104                                                              XMMRegister xtmp, int index, int vec_enc) {
5105    assert(vec_enc < Assembler::AVX_512bit, "");
5106    if (vec_enc == Assembler::AVX_256bit) {
5107      vextractf128_high(xtmp, src);
5108      vshufps(dst, src, xtmp, index, vec_enc);
5109    } else {
5110      vshufps(dst, src, zero, index, vec_enc);
5111    }
5112 }
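
// Note on the shuffle immediates used by callers: 0x88 (binary 10 00 10 00)
// selects doublewords 0 and 2 of each 128-bit lane, i.e. the low doubleword
// of every quadword, while 0x44 (binary 01 00 01 00) selects doublewords 0
// and 1.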
5113 
5114 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5115                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5116                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
5117   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5118 
5119   Label done;
5120   // Compare the destination lanes with float_sign_flip
5121   // value to get mask for all special values.
5122   movdqu(xtmp1, float_sign_flip, rscratch);
5123   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5124   ptest(xtmp2, xtmp2);
5125   jccb(Assembler::equal, done);
5126 
5127   // Flip float_sign_flip to get max integer value.
5128   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5129   pxor(xtmp1, xtmp4);
5130 
5131   // Set destination lanes corresponding to unordered source lanes to zero.
5132   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5133   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5134 
5135   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5136   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5137   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5138 
5139   // Recompute the mask for the remaining special values.
5140   pxor(xtmp2, xtmp3);
5141   // Extract mask corresponding to non-negative source lanes.
5142   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5143 
5144   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5145   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5146   pand(xtmp3, xtmp2);
5147 
5148   // Replace destination lanes holding special value(0x80000000) with max int
5149   // if corresponding source lane holds a +ve value.
5150   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5151   bind(done);
5152 }
5153 
5155 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5156                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5157   switch(to_elem_bt) {
5158     case T_SHORT:
5159       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5160       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5161       vpackusdw(dst, dst, zero, vec_enc);
5162       if (vec_enc == Assembler::AVX_256bit) {
5163         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5164       }
5165       break;
5166     case  T_BYTE:
5167       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5168       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5169       vpackusdw(dst, dst, zero, vec_enc);
5170       if (vec_enc == Assembler::AVX_256bit) {
5171         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5172       }
5173       vpackuswb(dst, dst, zero, vec_enc);
5174       break;
5175     default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt));
5176   }
5177 }
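
// The vpand steps above clear the upper bits of each element first (assuming
// the usual 0x0000FFFF / 0x000000FF per-element masks behind those stub
// addresses), so the unsigned-saturating packs cannot saturate and simply
// re-tile the narrowed elements.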
5178 
5179 /*
5180  * Algorithm for vector D2L and F2I conversions (when AVX10.2 is not supported):
5181  * a) Perform the vector D2L/F2I cast.
5182  * b) Take the fast path if none of the result vector lanes contains the value
5183  *    0x80000000, which signifies that the source value could be one of the
5184  *    special floating point values (NaN, -Inf, Inf, Max, -Min).
5185  * c) Set the destination to zero where the source is NaN.
5186  * d) Replace 0x80000000 with MaxInt where the source lane contains a +ve value.
5187  */
5188 
5189 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5190                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5191                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5192   int to_elem_sz = type2aelembytes(to_elem_bt);
5193   assert(to_elem_sz <= 4, "");
5194   vcvttps2dq(dst, src, vec_enc);
5195   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5196   if (to_elem_sz < 4) {
5197     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5198     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5199   }
5200 }
5201 
5202 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5203                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5204                                             Register rscratch, int vec_enc) {
5205   int to_elem_sz = type2aelembytes(to_elem_bt);
5206   assert(to_elem_sz <= 4, "");
5207   vcvttps2dq(dst, src, vec_enc);
5208   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5209   switch(to_elem_bt) {
5210     case T_INT:
5211       break;
5212     case T_SHORT:
5213       evpmovdw(dst, dst, vec_enc);
5214       break;
5215     case T_BYTE:
5216       evpmovdb(dst, dst, vec_enc);
5217       break;
5218     default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt));
5219   }
5220 }
5221 
5222 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5223                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5224                                             Register rscratch, int vec_enc) {
5225   evcvttps2qq(dst, src, vec_enc);
5226   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5227 }
5228 
5229 // Handling for downcasting from double to integer or sub-word types on AVX2.
5230 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5231                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5232                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5233   int to_elem_sz = type2aelembytes(to_elem_bt);
5234   assert(to_elem_sz < 8, "");
5235   vcvttpd2dq(dst, src, vec_enc);
5236   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5237                                               float_sign_flip, vec_enc);
5238   if (to_elem_sz < 4) {
5239     // xtmp4 holds all zero lanes.
5240     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5241   }
5242 }
5243 
5244 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5245                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5246                                             KRegister ktmp2, AddressLiteral sign_flip,
5247                                             Register rscratch, int vec_enc) {
5248   if (VM_Version::supports_avx512dq()) {
5249     evcvttpd2qq(dst, src, vec_enc);
5250     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5251     switch(to_elem_bt) {
5252       case T_LONG:
5253         break;
5254       case T_INT:
5255         evpmovsqd(dst, dst, vec_enc);
5256         break;
5257       case T_SHORT:
5258         evpmovsqd(dst, dst, vec_enc);
5259         evpmovdw(dst, dst, vec_enc);
5260         break;
5261       case T_BYTE:
5262         evpmovsqd(dst, dst, vec_enc);
5263         evpmovdb(dst, dst, vec_enc);
5264         break;
5265       default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt));
5266     }
5267   } else {
5268     assert(type2aelembytes(to_elem_bt) <= 4, "");
5269     vcvttpd2dq(dst, src, vec_enc);
5270     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5271     switch(to_elem_bt) {
5272       case T_INT:
5273         break;
5274       case T_SHORT:
5275         evpmovdw(dst, dst, vec_enc);
5276         break;
5277       case T_BYTE:
5278         evpmovdb(dst, dst, vec_enc);
5279         break;
5280       default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt));
5281     }
5282   }
5283 }
5284 
5285 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5286   switch(to_elem_bt) {
5287     case T_LONG:
5288       evcvttps2qqs(dst, src, vec_enc);
5289       break;
5290     case T_INT:
5291       evcvttps2dqs(dst, src, vec_enc);
5292       break;
5293     case T_SHORT:
5294       evcvttps2dqs(dst, src, vec_enc);
5295       evpmovdw(dst, dst, vec_enc);
5296       break;
5297     case T_BYTE:
5298       evcvttps2dqs(dst, src, vec_enc);
5299       evpmovdb(dst, dst, vec_enc);
5300       break;
5301     default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt));
5302   }
5303 }
5304 
5305 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5306   switch(to_elem_bt) {
5307     case T_LONG:
5308       evcvttps2qqs(dst, src, vec_enc);
5309       break;
5310     case T_INT:
5311       evcvttps2dqs(dst, src, vec_enc);
5312       break;
5313     case T_SHORT:
5314       evcvttps2dqs(dst, src, vec_enc);
5315       evpmovdw(dst, dst, vec_enc);
5316       break;
5317     case T_BYTE:
5318       evcvttps2dqs(dst, src, vec_enc);
5319       evpmovdb(dst, dst, vec_enc);
5320       break;
5321     default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt));
5322   }
5323 }
5324 
5325 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5326   switch(to_elem_bt) {
5327     case T_LONG:
5328       evcvttpd2qqs(dst, src, vec_enc);
5329       break;
5330     case T_INT:
5331       evcvttpd2dqs(dst, src, vec_enc);
5332       break;
5333     case T_SHORT:
5334       evcvttpd2dqs(dst, src, vec_enc);
5335       evpmovdw(dst, dst, vec_enc);
5336       break;
5337     case T_BYTE:
5338       evcvttpd2dqs(dst, src, vec_enc);
5339       evpmovdb(dst, dst, vec_enc);
5340       break;
5341     default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (reg src): %s", type2name(to_elem_bt));
5342   }
5343 }
5344 
5345 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5346   switch(to_elem_bt) {
5347     case T_LONG:
5348       evcvttpd2qqs(dst, src, vec_enc);
5349       break;
5350     case T_INT:
5351       evcvttpd2dqs(dst, src, vec_enc);
5352       break;
5353     case T_SHORT:
5354       evcvttpd2dqs(dst, src, vec_enc);
5355       evpmovdw(dst, dst, vec_enc);
5356       break;
5357     case T_BYTE:
5358       evcvttpd2dqs(dst, src, vec_enc);
5359       evpmovdb(dst, dst, vec_enc);
5360       break;
5361     default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt));
5362   }
5363 }
5364 
5365 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5366                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5367                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5368   // Perform the floor(val + 0.5) operation with MXCSR.RC set to round towards -inf,
5369   // then restore the original MXCSR.RC mode afterwards.
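  // Worked example: val = 2.5  -> 2.5 + 0.5 = 3.0  -> floor = 3;
  //                 val = -2.5 -> -2.0 after the add -> floor = -2
  // (round-half-up, matching Math.round).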
5370   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5371 
5372   mov64(tmp, julong_cast(0.5L));
5373   evpbroadcastq(xtmp1, tmp, vec_enc);
5374   vaddpd(xtmp1, src , xtmp1, vec_enc);
5375   evcvtpd2qq(dst, xtmp1, vec_enc);
5376   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5377                                                 double_sign_flip, vec_enc);
5378 
5379   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5380 }
5381 
5382 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5383                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5384                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5385   // Perform the floor(val + 0.5) operation with MXCSR.RC set to round towards -inf,
5386   // then restore the original MXCSR.RC mode afterwards.
5387   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5388 
5389   movl(tmp, jint_cast(0.5));
5390   movq(xtmp1, tmp);
5391   vbroadcastss(xtmp1, xtmp1, vec_enc);
5392   vaddps(xtmp1, src , xtmp1, vec_enc);
5393   vcvtps2dq(dst, xtmp1, vec_enc);
5394   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5395                                               float_sign_flip, vec_enc);
5396 
5397   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5398 }
5399 
5400 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5401                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5402                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
5403   // Perform the floor(val + 0.5) operation with MXCSR.RC set to round towards -inf,
5404   // then restore the original MXCSR.RC mode afterwards.
5405   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5406 
5407   movl(tmp, jint_cast(0.5));
5408   movq(xtmp1, tmp);
5409   vbroadcastss(xtmp1, xtmp1, vec_enc);
5410   vaddps(xtmp1, src , xtmp1, vec_enc);
5411   vcvtps2dq(dst, xtmp1, vec_enc);
5412   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5413 
5414   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5415 }
5416 
5417 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5418                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5419   switch (from_elem_bt) {
5420     case T_BYTE:
5421       switch (to_elem_bt) {
5422         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5423         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5424         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5425         default: ShouldNotReachHere();
5426       }
5427       break;
5428     case T_SHORT:
5429       switch (to_elem_bt) {
5430         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5431         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5432         default: ShouldNotReachHere();
5433       }
5434       break;
5435     case T_INT:
5436       assert(to_elem_bt == T_LONG, "");
5437       vpmovzxdq(dst, src, vlen_enc);
5438       break;
5439     default:
5440       ShouldNotReachHere();
5441   }
5442 }
5443 
5444 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5445                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5446   switch (from_elem_bt) {
5447     case T_BYTE:
5448       switch (to_elem_bt) {
5449         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5450         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5451         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5452         default: ShouldNotReachHere();
5453       }
5454       break;
5455     case T_SHORT:
5456       switch (to_elem_bt) {
5457         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5458         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5459         default: ShouldNotReachHere();
5460       }
5461       break;
5462     case T_INT:
5463       assert(to_elem_bt == T_LONG, "");
5464       vpmovsxdq(dst, src, vlen_enc);
5465       break;
5466     default:
5467       ShouldNotReachHere();
5468   }
5469 }
5470 
5471 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5472                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5473   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5474   assert(vlen_enc != AVX_512bit, "");
5475 
5476   int dst_bt_size = type2aelembytes(dst_bt);
5477   int src_bt_size = type2aelembytes(src_bt);
5478   if (dst_bt_size > src_bt_size) {
5479     switch (dst_bt_size / src_bt_size) {
5480       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5481       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5482       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5483       default: ShouldNotReachHere();
5484     }
5485   } else {
5486     assert(dst_bt_size < src_bt_size, "");
5487     switch (src_bt_size / dst_bt_size) {
5488       case 2: {
5489         if (vlen_enc == AVX_128bit) {
5490           vpacksswb(dst, src, src, vlen_enc);
5491         } else {
5492           vpacksswb(dst, src, src, vlen_enc);
5493           vpermq(dst, dst, 0x08, vlen_enc);
5494         }
5495         break;
5496       }
5497       case 4: {
5498         if (vlen_enc == AVX_128bit) {
5499           vpackssdw(dst, src, src, vlen_enc);
5500           vpacksswb(dst, dst, dst, vlen_enc);
5501         } else {
5502           vpackssdw(dst, src, src, vlen_enc);
5503           vpermq(dst, dst, 0x08, vlen_enc);
5504           vpacksswb(dst, dst, dst, AVX_128bit);
5505         }
5506         break;
5507       }
5508       case 8: {
5509         if (vlen_enc == AVX_128bit) {
5510           vpshufd(dst, src, 0x08, vlen_enc);
5511           vpackssdw(dst, dst, dst, vlen_enc);
5512           vpacksswb(dst, dst, dst, vlen_enc);
5513         } else {
5514           vpshufd(dst, src, 0x08, vlen_enc);
5515           vpermq(dst, dst, 0x08, vlen_enc);
5516           vpackssdw(dst, dst, dst, AVX_128bit);
5517           vpacksswb(dst, dst, dst, AVX_128bit);
5518         }
5519         break;
5520       }
5521       default: ShouldNotReachHere();
5522     }
5523   }
5524 }
5525 
5526 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5527                                    bool merge, BasicType bt, int vlen_enc) {
5528   if (bt == T_INT) {
5529     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5530   } else {
5531     assert(bt == T_LONG, "");
5532     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5533   }
5534 }
5535 
5536 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5537                                    bool merge, BasicType bt, int vlen_enc) {
5538   if (bt == T_INT) {
5539     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5540   } else {
5541     assert(bt == T_LONG, "");
5542     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5543   }
5544 }
5545 
5546 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5547                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5548                                                int vec_enc) {
5549   int index = 0;
5550   int vindex = 0;
5551   mov64(rtmp1, 0x0101010101010101L);
5552   pdepq(rtmp1, src, rtmp1);
5553   if (mask_len > 8) {
5554     movq(rtmp2, src);
5555     vpxor(xtmp, xtmp, xtmp, vec_enc);
5556     movq(xtmp, rtmp1);
5557   }
5558   movq(dst, rtmp1);
5559 
5560   mask_len -= 8;
5561   while (mask_len > 0) {
    assert((mask_len & 0x7) == 0, "mask_len must be a multiple of 8");
5563     index++;
5564     if ((index % 2) == 0) {
5565       pxor(xtmp, xtmp);
5566     }
5567     mov64(rtmp1, 0x0101010101010101L);
5568     shrq(rtmp2, 8);
5569     pdepq(rtmp1, rtmp2, rtmp1);
5570     pinsrq(xtmp, rtmp1, index % 2);
5571     vindex = index / 2;
5572     if (vindex) {
      // Write the entire 16 byte vector once both 64 bit
      // lanes are updated, to save redundant instructions.
5575       if (index % 2) {
5576         vinsertf128(dst, dst, xtmp, vindex);
5577       }
5578     } else {
5579       vmovdqu(dst, xtmp);
5580     }
5581     mask_len -= 8;
5582   }
5583 }
5584 
5585 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5586   switch(opc) {
5587     case Op_VectorMaskTrueCount:
5588       popcntq(dst, tmp);
5589       break;
5590     case Op_VectorMaskLastTrue:
5591       if (VM_Version::supports_lzcnt()) {
5592         lzcntq(tmp, tmp);
5593         movl(dst, 63);
5594         subl(dst, tmp);
5595       } else {
5596         movl(dst, -1);
5597         bsrq(tmp, tmp);
5598         cmov32(Assembler::notZero, dst, tmp);
5599       }
5600       break;
5601     case Op_VectorMaskFirstTrue:
5602       if (VM_Version::supports_bmi1()) {
5603         if (masklen < 32) {
5604           orl(tmp, 1 << masklen);
5605           tzcntl(dst, tmp);
5606         } else if (masklen == 32) {
5607           tzcntl(dst, tmp);
5608         } else {
5609           assert(masklen == 64, "");
5610           tzcntq(dst, tmp);
5611         }
5612       } else {
5613         if (masklen < 32) {
5614           orl(tmp, 1 << masklen);
5615           bsfl(dst, tmp);
5616         } else {
5617           assert(masklen == 32 || masklen == 64, "");
5618           movl(dst, masklen);
5619           if (masklen == 32)  {
5620             bsfl(tmp, tmp);
5621           } else {
5622             bsfq(tmp, tmp);
5623           }
5624           cmov32(Assembler::notZero, dst, tmp);
5625         }
5626       }
5627       break;
5628     case Op_VectorMaskToLong:
5629       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5630       break;
5631     default: assert(false, "Unhandled mask operation");
5632   }
5633 }
5634 
5635 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5636                                               int masklen, int masksize, int vec_enc) {
5637   assert(VM_Version::supports_popcnt(), "");
5638 
  if (VM_Version::supports_avx512bw()) {
5640     kmovql(tmp, mask);
5641   } else {
5642     assert(masklen <= 16, "");
5643     kmovwl(tmp, mask);
5644   }
5645 
  // A mask generated by partial vector comparison/replicate/mask manipulation
  // operations needs to be clipped.
5648   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5649     andq(tmp, (1 << masklen) - 1);
5650   }
5651 
5652   vector_mask_operation_helper(opc, dst, tmp, masklen);
5653 }
5654 
5655 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5656                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5657   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5658          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5659   assert(VM_Version::supports_popcnt(), "");
5660 
5661   bool need_clip = false;
5662   switch(bt) {
5663     case T_BOOLEAN:
      // Masks of other types contain 0 / -1 lanes; boolean masks contain lane values of 0 / 1.
5665       vpxor(xtmp, xtmp, xtmp, vec_enc);
5666       vpsubb(xtmp, xtmp, mask, vec_enc);
5667       vpmovmskb(tmp, xtmp, vec_enc);
5668       need_clip = masklen < 16;
5669       break;
5670     case T_BYTE:
5671       vpmovmskb(tmp, mask, vec_enc);
5672       need_clip = masklen < 16;
5673       break;
5674     case T_SHORT:
5675       vpacksswb(xtmp, mask, mask, vec_enc);
5676       if (masklen >= 16) {
5677         vpermpd(xtmp, xtmp, 8, vec_enc);
5678       }
5679       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5680       need_clip = masklen < 16;
5681       break;
5682     case T_INT:
5683     case T_FLOAT:
5684       vmovmskps(tmp, mask, vec_enc);
5685       need_clip = masklen < 4;
5686       break;
5687     case T_LONG:
5688     case T_DOUBLE:
5689       vmovmskpd(tmp, mask, vec_enc);
5690       need_clip = masklen < 2;
5691       break;
5692     default: assert(false, "Unhandled type, %s", type2name(bt));
5693   }
5694 
  // A mask generated by partial vector comparison/replicate/mask manipulation
  // operations needs to be clipped.
5697   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5698     // need_clip implies masklen < 32
5699     andq(tmp, (1 << masklen) - 1);
5700   }
5701 
5702   vector_mask_operation_helper(opc, dst, tmp, masklen);
5703 }
5704 
5705 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5706                                              Register rtmp2, int mask_len) {
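  // The clipped input mask is used as the PEXT selector over an all-ones
  // source, so dst receives popcount(src & ((1 << mask_len) - 1)) contiguous
  // low bits set, i.e. the compressed form of the input mask.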
5707   kmov(rtmp1, src);
5708   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5709   mov64(rtmp2, -1L);
5710   pextq(rtmp2, rtmp2, rtmp1);
5711   kmov(dst, rtmp2);
5712 }
5713 
5714 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5715                                                     XMMRegister mask, Register rtmp, Register rscratch,
5716                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5717                                                     int vec_enc) {
5718   assert(type2aelembytes(bt) >= 4, "");
5719   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5720   address compress_perm_table = nullptr;
5721   address expand_perm_table = nullptr;
5722   if (type2aelembytes(bt) == 8) {
5723     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5724     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5725     vmovmskpd(rtmp, mask, vec_enc);
5726   } else {
5727     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5728     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5729     vmovmskps(rtmp, mask, vec_enc);
5730   }
5731   shlq(rtmp, 5); // for 32 byte permute row.
5732   if (opcode == Op_CompressV) {
5733     lea(rscratch, ExternalAddress(compress_perm_table));
5734   } else {
5735     lea(rscratch, ExternalAddress(expand_perm_table));
5736   }
5737   addptr(rtmp, rscratch);
5738   vmovdqu(permv, Address(rtmp));
5739   vpermps(dst, permv, src, Assembler::AVX_256bit);
5740   vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with a zero vector using the permute vector as the mask:
  // each column entry in a permute table row contains either a valid permute
  // index or -1 (the default), so the sign bit of the -1 entries can serve as
  // a blending mask after compressing/expanding the source vector lanes.
5745   vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
5746 }
5747 
5748 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5749                                                bool merge, BasicType bt, int vec_enc) {
5750   if (opcode == Op_CompressV) {
5751     switch(bt) {
5752     case T_BYTE:
5753       evpcompressb(dst, mask, src, merge, vec_enc);
5754       break;
5755     case T_CHAR:
5756     case T_SHORT:
5757       evpcompressw(dst, mask, src, merge, vec_enc);
5758       break;
5759     case T_INT:
5760       evpcompressd(dst, mask, src, merge, vec_enc);
5761       break;
5762     case T_FLOAT:
5763       evcompressps(dst, mask, src, merge, vec_enc);
5764       break;
5765     case T_LONG:
5766       evpcompressq(dst, mask, src, merge, vec_enc);
5767       break;
5768     case T_DOUBLE:
5769       evcompresspd(dst, mask, src, merge, vec_enc);
5770       break;
5771     default:
5772       fatal("Unsupported type %s", type2name(bt));
5773       break;
5774     }
5775   } else {
5776     assert(opcode == Op_ExpandV, "");
5777     switch(bt) {
5778     case T_BYTE:
5779       evpexpandb(dst, mask, src, merge, vec_enc);
5780       break;
5781     case T_CHAR:
5782     case T_SHORT:
5783       evpexpandw(dst, mask, src, merge, vec_enc);
5784       break;
5785     case T_INT:
5786       evpexpandd(dst, mask, src, merge, vec_enc);
5787       break;
5788     case T_FLOAT:
5789       evexpandps(dst, mask, src, merge, vec_enc);
5790       break;
5791     case T_LONG:
5792       evpexpandq(dst, mask, src, merge, vec_enc);
5793       break;
5794     case T_DOUBLE:
5795       evexpandpd(dst, mask, src, merge, vec_enc);
5796       break;
5797     default:
5798       fatal("Unsupported type %s", type2name(bt));
5799       break;
5800     }
5801   }
5802 }
5803 
5804 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5805                                            KRegister ktmp1, int vec_enc) {
5806   if (opcode == Op_SignumVD) {
5807     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5809     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5810     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src (EQ_UQ also matches unordered).
5812     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5813     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5814   } else {
5815     assert(opcode == Op_SignumVF, "");
5816     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5818     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5819     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src (EQ_UQ also matches unordered).
5821     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5822     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5823   }
5824 }
5825 
5826 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5827                                           XMMRegister xtmp1, int vec_enc) {
5828   if (opcode == Op_SignumVD) {
5829     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5831     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src is NaN, -0.0 or 0.0, return src (EQ_UQ also matches unordered).
5833     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5834     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5835   } else {
5836     assert(opcode == Op_SignumVF, "");
5837     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5839     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src is NaN, -0.0 or 0.0, return src (EQ_UQ also matches unordered).
5841     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5842     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5843   }
5844 }
5845 
5846 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5847   if (VM_Version::supports_avx512bw()) {
5848     if (mask_len > 32) {
5849       kmovql(dst, src);
5850     } else {
5851       kmovdl(dst, src);
5852       if (mask_len != 32) {
5853         kshiftrdl(dst, dst, 32 - mask_len);
5854       }
5855     }
5856   } else {
5857     assert(mask_len <= 16, "");
5858     kmovwl(dst, src);
5859     if (mask_len != 16) {
5860       kshiftrwl(dst, dst, 16 - mask_len);
5861     }
5862   }
5863 }
5864 
5865 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5866   int lane_size = type2aelembytes(bt);
5867   if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5868       (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5869     movptr(rtmp, imm32);
5870     switch(lane_size) {
5871       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5872       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5873       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5874       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default: fatal("Unsupported lane size %d", lane_size); break;
5877     }
5878   } else {
5879     movptr(rtmp, imm32);
5880     movq(dst, rtmp);
5881     switch(lane_size) {
5882       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5883       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5884       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5885       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default: fatal("Unsupported lane size %d", lane_size); break;
5888     }
5889   }
5890 }
5891 
5892 //
// The following is the lookup table based popcount computation algorithm:
5894 //       Index   Bit set count
5895 //     [ 0000 ->   0,
5896 //       0001 ->   1,
5897 //       0010 ->   1,
5898 //       0011 ->   2,
5899 //       0100 ->   1,
5900 //       0101 ->   2,
5901 //       0110 ->   2,
5902 //       0111 ->   3,
5903 //       1000 ->   1,
5904 //       1001 ->   2,
5905 //       1010 ->   3,
5906 //       1011 ->   3,
5907 //       1100 ->   2,
5908 //       1101 ->   3,
5909 //       1111 ->   4 ]
//  a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used
//     as shuffle indices for lookup table access.
//  b. Right shift each byte of the vector lane by 4 positions.
//  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used
//     as shuffle indices for lookup table access.
//  d. Add the bitset counts of the upper and lower 4 bits of each byte.
//  e. Unpack double words to quad words and compute the sum of absolute
//     differences of the bitset counts of all the bytes of a quadword.
//  f. Perform step e. for the upper 128 bit vector lane.
//  g. Pack the bitset counts of quadwords back to double words.
//  h. Unpacking and packing operations are not needed for 64 bit vector lanes.
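//
// A scalar sketch of steps a-d for a single byte b (illustrative only, not
// emitted code), with LUT holding the 16-entry table above:
//   int popcount_byte(uint8_t b) {
//     static const uint8_t LUT[16] = {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4};
//     return LUT[b & 0x0F] + LUT[(b >> 4) & 0x0F];
//   }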
5921 
5922 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5923                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5924   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5925   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5926   vpsrlw(dst, src, 4, vec_enc);
5927   vpand(dst, dst, xtmp1, vec_enc);
5928   vpand(xtmp1, src, xtmp1, vec_enc);
5929   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5930   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5931   vpshufb(dst, xtmp2, dst, vec_enc);
5932   vpaddb(dst, dst, xtmp1, vec_enc);
5933 }
5934 
5935 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5936                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5937   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // The following code corresponds to steps e, f, g and h of the above algorithm.
5939   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5940   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5941   vpsadbw(dst, dst, xtmp2, vec_enc);
5942   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5943   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5944   vpackuswb(dst, xtmp1, dst, vec_enc);
5945 }
5946 
5947 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5948                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5949   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Add the popcounts of the upper and lower bytes of each word.
5951   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5952   vpsrlw(dst, xtmp1, 8, vec_enc);
5953   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5954   vpaddw(dst, dst, xtmp1, vec_enc);
5955 }
5956 
5957 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5958                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5959   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5960   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5961   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5962 }
5963 
5964 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5965                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5966   switch(bt) {
5967     case T_LONG:
5968       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5969       break;
5970     case T_INT:
5971       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5972       break;
5973     case T_CHAR:
5974     case T_SHORT:
5975       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5976       break;
5977     case T_BYTE:
5978     case T_BOOLEAN:
5979       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5980       break;
5981     default:
5982       fatal("Unsupported type %s", type2name(bt));
5983       break;
5984   }
5985 }
5986 
5987 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5988                                                       KRegister mask, bool merge, int vec_enc) {
5989   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5990   switch(bt) {
5991     case T_LONG:
5992       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5993       evpopcntq(dst, mask, src, merge, vec_enc);
5994       break;
5995     case T_INT:
5996       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5997       evpopcntd(dst, mask, src, merge, vec_enc);
5998       break;
5999     case T_CHAR:
6000     case T_SHORT:
6001       assert(VM_Version::supports_avx512_bitalg(), "");
6002       evpopcntw(dst, mask, src, merge, vec_enc);
6003       break;
6004     case T_BYTE:
6005     case T_BOOLEAN:
6006       assert(VM_Version::supports_avx512_bitalg(), "");
6007       evpopcntb(dst, mask, src, merge, vec_enc);
6008       break;
6009     default:
6010       fatal("Unsupported type %s", type2name(bt));
6011       break;
6012   }
6013 }
6014 
// The bit reversal algorithm first reverses the bits of each byte and then
// performs a byte-level reversal for multi-byte primitive types (short/int/long).
// The algorithm uses a lookup table access to get the reversed bit sequence
// corresponding to a 4 bit value; thus the reversed bit sequence for a byte
// is obtained by swapping the reversed bit sequences of the upper and lower
// nibble of the byte.
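// A scalar sketch for a single byte (illustrative only, not emitted code),
// with REV4[i] holding the bit-reversed 4-bit value of i:
//   uint8_t reverse_byte(uint8_t b) {
//     static const uint8_t REV4[16] = {0x0,0x8,0x4,0xC,0x2,0xA,0x6,0xE,
//                                      0x1,0x9,0x5,0xD,0x3,0xB,0x7,0xF};
//     return (uint8_t)((REV4[b & 0x0F] << 4) | REV4[(b >> 4) & 0x0F]);
//   }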
6021 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6022                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
6023   if (VM_Version::supports_avx512vlbw()) {
6024 
6025     // Get the reverse bit sequence of lower nibble of each byte.
6026     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
6027     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6028     evpandq(dst, xtmp2, src, vec_enc);
6029     vpshufb(dst, xtmp1, dst, vec_enc);
6030     vpsllq(dst, dst, 4, vec_enc);
6031 
6032     // Get the reverse bit sequence of upper nibble of each byte.
6033     vpandn(xtmp2, xtmp2, src, vec_enc);
6034     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6035     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6036 
6037     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
6038     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
6039     evporq(xtmp2, dst, xtmp2, vec_enc);
6040     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6041 
  } else if (vec_enc == Assembler::AVX_512bit) {
6043     // Shift based bit reversal.
6044     assert(bt == T_LONG || bt == T_INT, "");
6045 
6046     // Swap lower and upper nibble of each byte.
6047     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
6048 
6049     // Swap two least and most significant bits of each nibble.
6050     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
6051 
6052     // Swap adjacent pair of bits.
6053     evmovdqul(xtmp1, k0, dst, true, vec_enc);
6054     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
6055 
6056     evmovdqul(xtmp1, k0, dst, true, vec_enc);
6057     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
6058   } else {
6059     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
6060     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6061 
6062     // Get the reverse bit sequence of lower nibble of each byte.
6063     vpand(dst, xtmp2, src, vec_enc);
6064     vpshufb(dst, xtmp1, dst, vec_enc);
6065     vpsllq(dst, dst, 4, vec_enc);
6066 
6067     // Get the reverse bit sequence of upper nibble of each byte.
6068     vpandn(xtmp2, xtmp2, src, vec_enc);
6069     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6070     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6071 
6072     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
6073     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
6074     vpor(xtmp2, dst, xtmp2, vec_enc);
6075     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6076   }
6077 }
6078 
6079 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
6080                                                 XMMRegister xtmp, Register rscratch) {
6081   assert(VM_Version::supports_gfni(), "");
6082   assert(rscratch != noreg || always_reachable(mask), "missing");
6083 
6084   // Galois field instruction based bit reversal based on following algorithm.
6085   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6086   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
6087   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
6088   vector_reverse_byte(bt, dst, xtmp, vec_enc);
6089 }
6090 
6091 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
6092                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
6093   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
6094   evpandq(dst, xtmp1, src, vec_enc);
6095   vpsllq(dst, dst, nbits, vec_enc);
6096   vpandn(xtmp1, xtmp1, src, vec_enc);
6097   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
6098   evporq(dst, dst, xtmp1, vec_enc);
6099 }
6100 
6101 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6102                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
6103   // Shift based bit reversal.
6104   assert(VM_Version::supports_evex(), "");
6105   switch(bt) {
6106     case T_LONG:
6107       // Swap upper and lower double word of each quad word.
6108       evprorq(xtmp1, k0, src, 32, true, vec_enc);
6109       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
6110       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6111       break;
6112     case T_INT:
6113       // Swap upper and lower word of each double word.
6114       evprord(xtmp1, k0, src, 16, true, vec_enc);
6115       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6116       break;
6117     case T_CHAR:
6118     case T_SHORT:
6119       // Swap upper and lower byte of each word.
6120       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
6121       break;
6122     case T_BYTE:
6123       evmovdquq(dst, k0, src, true, vec_enc);
6124       break;
6125     default:
6126       fatal("Unsupported type %s", type2name(bt));
6127       break;
6128   }
6129 }
6130 
6131 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
6132   if (bt == T_BYTE) {
6133     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
6134       evmovdquq(dst, k0, src, true, vec_enc);
6135     } else {
6136       vmovdqu(dst, src);
6137     }
6138     return;
6139   }
6140   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
6141   // pre-computed shuffle indices.
6142   switch(bt) {
6143     case T_LONG:
6144       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
6145       break;
6146     case T_INT:
6147       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
6148       break;
6149     case T_CHAR:
6150     case T_SHORT:
6151       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6152       break;
6153     default:
6154       fatal("Unsupported type %s", type2name(bt));
6155       break;
6156   }
6157   vpshufb(dst, src, dst, vec_enc);
6158 }
6159 
6160 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6161                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6162                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6163   assert(is_integral_type(bt), "");
6164   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6165   assert(VM_Version::supports_avx512cd(), "");
6166   switch(bt) {
6167     case T_LONG:
6168       evplzcntq(dst, ktmp, src, merge, vec_enc);
6169       break;
6170     case T_INT:
6171       evplzcntd(dst, ktmp, src, merge, vec_enc);
6172       break;
6173     case T_SHORT:
6174       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6175       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6176       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6177       vpunpckhwd(dst, xtmp1, src, vec_enc);
6178       evplzcntd(dst, ktmp, dst, merge, vec_enc);
6179       vpackusdw(dst, xtmp2, dst, vec_enc);
6180       break;
6181     case T_BYTE:
6182       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6183       // accessing the lookup table.
6184       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6185       // accessing the lookup table.
6186       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6187       assert(VM_Version::supports_avx512bw(), "");
6188       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6189       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6190       vpand(xtmp2, dst, src, vec_enc);
6191       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6192       vpsrlw(xtmp3, src, 4, vec_enc);
6193       vpand(xtmp3, dst, xtmp3, vec_enc);
6194       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6195       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6196       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6197       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6198       break;
6199     default:
6200       fatal("Unsupported type %s", type2name(bt));
6201       break;
6202   }
6203 }
6204 
6205 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6206                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6207   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6208   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6209   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6210   // accessing the lookup table.
6211   vpand(dst, xtmp2, src, vec_enc);
6212   vpshufb(dst, xtmp1, dst, vec_enc);
6213   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6214   // accessing the lookup table.
6215   vpsrlw(xtmp3, src, 4, vec_enc);
6216   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6217   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6218   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6219   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6220   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6221   vpaddb(dst, dst, xtmp2, vec_enc);
6222   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6223 }
6224 
6225 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6226                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6227   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6228   // Add zero counts of lower byte and upper byte of a word if
6229   // upper byte holds a zero value.
6230   vpsrlw(xtmp3, src, 8, vec_enc);
6231   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6232   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6233   vpsllw(xtmp2, dst, 8, vec_enc);
6234   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6235   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6236   vpsrlw(dst, dst, 8, vec_enc);
6237 }
6238 
6239 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6240                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
6241   // By converting the integer to a float, we can obtain the number of leading zeros based on the exponent of the float.
6242   // As the float exponent contains a bias of 127 for nonzero values, the bias must be removed before interpreting the
6243   // exponent as the leading zero count.
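  // For example, src = 1 converts to 1.0f (biased exponent 127), giving
  // clz = 31 - (127 - 127) = 31; src = 0x00010000 converts to 2^16 (biased
  // exponent 143), giving clz = 31 - (143 - 127) = 15.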
6244 
6245   // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
6246   // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
6247   // contributes to the leading number of zeros.
6248   vpsrld(dst, src, 1, vec_enc);
6249   vpandn(dst, dst, src, vec_enc);
6250 
6251   vcvtdq2ps(dst, dst, vec_enc);
6252 
6253   // By comparing the register to itself, all the bits in the destination are set.
6254   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6255 
6256   // Move the biased exponent to the low end of the lane and mask with 0xFF to discard the sign bit.
6257   vpsrld(xtmp2, xtmp1, 24, vec_enc);
6258   vpsrld(dst, dst, 23, vec_enc);
6259   vpand(dst, xtmp2, dst, vec_enc);
6260 
6261   // Subtract 127 from the exponent, which removes the bias from the exponent.
6262   vpsrld(xtmp2, xtmp1, 25, vec_enc);
6263   vpsubd(dst, dst, xtmp2, vec_enc);
6264 
6265   vpsrld(xtmp2, xtmp1, 27, vec_enc);
6266 
6267   // If the original value is 0 the exponent would not have bias, so the subtraction creates a negative number. If this
6268   // is found in any of the lanes, replace the lane with -1 from xtmp1.
6269   vblendvps(dst, dst, xtmp1, dst, vec_enc, true, xtmp3);
6270 
6271   // If the original value is negative, replace the lane with 31.
6272   vblendvps(dst, dst, xtmp2, src, vec_enc, true, xtmp3);
6273 
6274   // Subtract the exponent from 31, giving the final result. For 0, the result is 32 as the exponent was replaced with -1,
6275   // and for negative numbers the result is 0 as the exponent was replaced with 31.
6276   vpsubd(dst, xtmp2, dst, vec_enc);
6277 }
6278 
6279 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6280                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6281   // Find the leading zeros of the top and bottom halves of the long individually.
6282   vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6283 
6284   // Move the top half result to the bottom half of xtmp1, setting the top half to 0.
6285   vpsrlq(xtmp1, dst, 32, vec_enc);
6286   // By moving the top half result to the right by 6 bits, if the top half was empty (i.e. 32 is returned) the result bit will
6287   // be in the most significant position of the bottom half.
6288   vpsrlq(xtmp2, dst, 6, vec_enc);
6289 
6290   // In the bottom half, add the top half and bottom half results.
6291   vpaddq(dst, xtmp1, dst, vec_enc);
6292 
  // For the bottom half, choose between the values using the most significant bit of xtmp2.
  // If the MSB is set (the top half was zero), then bottom+top in dst is the resulting value.
  // If the top-half count is less than 32, xtmp1 is chosen, which contains only the top half result.
  // In the top half the MSB is always zero, so the value in xtmp1 is always chosen. This value is
  // always 0, which clears the lane as required.
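  // For example, src = 1: the 32-bit halves yield clz(top) = 32 and
  // clz(bottom) = 31, the MSB of the bottom lane of xtmp2 is set (bit 5 of
  // the value 32 lands there after the shifts), and dst = 32 + 31 = 63 is
  // selected, as expected for a 64-bit clz of 1.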
6298   vblendvps(dst, xtmp1, dst, xtmp2, vec_enc, true, xtmp3);
6299 }
6300 
6301 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6302                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6303                                                        Register rtmp, int vec_enc) {
6304   assert(is_integral_type(bt), "unexpected type");
6305   assert(vec_enc < Assembler::AVX_512bit, "");
6306   switch(bt) {
6307     case T_LONG:
6308       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6309       break;
6310     case T_INT:
6311       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6312       break;
6313     case T_SHORT:
6314       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6315       break;
6316     case T_BYTE:
6317       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6318       break;
6319     default:
6320       fatal("Unsupported type %s", type2name(bt));
6321       break;
6322   }
6323 }
6324 
6325 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6326   switch(bt) {
6327     case T_BYTE:
6328       vpsubb(dst, src1, src2, vec_enc);
6329       break;
6330     case T_SHORT:
6331       vpsubw(dst, src1, src2, vec_enc);
6332       break;
6333     case T_INT:
6334       vpsubd(dst, src1, src2, vec_enc);
6335       break;
6336     case T_LONG:
6337       vpsubq(dst, src1, src2, vec_enc);
6338       break;
6339     default:
6340       fatal("Unsupported type %s", type2name(bt));
6341       break;
6342   }
6343 }
6344 
// Trailing zero count computation is based on the leading zero count operation as
// per the following equation. All AVX3 targets support the AVX512CD feature, which
// offers a direct vector instruction to compute the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
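//      For example, x = 0b100: (x - 1) & ~x = 0b011, so CLZ = 30 and
//      CTZ = 32 - 30 = 2; x == 0 yields all ones, CLZ = 0 and CTZ = 32.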
6349 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6350                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6351                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6352   assert(is_integral_type(bt), "");
6353   // xtmp = -1
6354   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6355   // xtmp = xtmp + src
6356   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6357   // xtmp = xtmp & ~src
6358   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6359   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6360   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6361   vpsub(bt, dst, xtmp4, dst, vec_enc);
6362 }
6363 
// Trailing zero count computation for AVX2 targets is based on the popcount operation
// as per the following equation:
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
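//      For example, x = 0b100: x | -x sets all bits from bit 2 upwards, so
//      POPC = 30 and CTZ = 32 - 30 = 2; x == 0 yields POPC = 0 and CTZ = 32.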
6366 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6367                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6368   assert(is_integral_type(bt), "");
6369   // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6371   // xtmp = 0 - src
6372   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6373   // xtmp = xtmp | src
6374   vpor(xtmp3, xtmp3, src, vec_enc);
6375   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6376   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6377   vpsub(bt, dst, xtmp1, dst, vec_enc);
6378 }
6379 
6380 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6381   Label done;
6382   Label neg_divisor_fastpath;
6383   cmpl(divisor, 0);
6384   jccb(Assembler::less, neg_divisor_fastpath);
6385   xorl(rdx, rdx);
6386   divl(divisor);
6387   jmpb(done);
6388   bind(neg_divisor_fastpath);
6389   // Fastpath for divisor < 0:
6390   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6391   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
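  // A divisor with the sign bit set is >= 2^31 as unsigned, so the unsigned
  // quotient can only be 0 or 1; it is 1 exactly when dividend >=u divisor,
  // which the expression computes branchlessly (e.g. 0x80000000 / 0x80000000
  // gives 1, while 0x7FFFFFFF / 0x80000000 gives 0).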
6392   movl(rdx, rax);
6393   subl(rdx, divisor);
6394   if (VM_Version::supports_bmi1()) {
6395     andnl(rax, rdx, rax);
6396   } else {
6397     notl(rdx);
6398     andl(rax, rdx);
6399   }
6400   shrl(rax, 31);
6401   bind(done);
6402 }
6403 
6404 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6405   Label done;
6406   Label neg_divisor_fastpath;
6407   cmpl(divisor, 0);
6408   jccb(Assembler::less, neg_divisor_fastpath);
6409   xorl(rdx, rdx);
6410   divl(divisor);
6411   jmpb(done);
6412   bind(neg_divisor_fastpath);
6413   // Fastpath when divisor < 0:
6414   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6415   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
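  // The arithmetic shift turns the 0/1 quotient bit into a 0/-1 mask, so the
  // divisor is subtracted exactly once when dividend >=u divisor
  // (e.g. 0x80000006 %u 0x80000005 = 1, while 7 %u 0x80000005 = 7).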
6416   movl(rdx, rax);
6417   subl(rax, divisor);
6418   if (VM_Version::supports_bmi1()) {
6419     andnl(rax, rax, rdx);
6420   } else {
6421     notl(rax);
6422     andl(rax, rdx);
6423   }
6424   sarl(rax, 31);
6425   andl(rax, divisor);
6426   subl(rdx, rax);
6427   bind(done);
6428 }
6429 
6430 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6431   Label done;
6432   Label neg_divisor_fastpath;
6433 
6434   cmpl(divisor, 0);
6435   jccb(Assembler::less, neg_divisor_fastpath);
6436   xorl(rdx, rdx);
6437   divl(divisor);
6438   jmpb(done);
6439   bind(neg_divisor_fastpath);
6440   // Fastpath for divisor < 0:
6441   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6442   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6443   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6444   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6445   movl(rdx, rax);
6446   subl(rax, divisor);
6447   if (VM_Version::supports_bmi1()) {
6448     andnl(rax, rax, rdx);
6449   } else {
6450     notl(rax);
6451     andl(rax, rdx);
6452   }
6453   movl(tmp, rax);
6454   shrl(rax, 31); // quotient
6455   sarl(tmp, 31);
6456   andl(tmp, divisor);
6457   subl(rdx, tmp); // remainder
6458   bind(done);
6459 }
6460 
6461 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6462                                  XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
6464     // Galois field instruction based bit reversal based on following algorithm.
6465     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6466     mov64(rtmp, 0x8040201008040201L);
6467     movq(xtmp1, src);
6468     movq(xtmp2, rtmp);
6469     gf2p8affineqb(xtmp1, xtmp2, 0);
6470     movq(dst, xtmp1);
6471   } else {
6472     // Swap even and odd numbered bits.
6473     movl(rtmp, src);
6474     andl(rtmp, 0x55555555);
6475     shll(rtmp, 1);
6476     movl(dst, src);
6477     andl(dst, 0xAAAAAAAA);
6478     shrl(dst, 1);
6479     orl(dst, rtmp);
6480 
6481     // Swap LSB and MSB 2 bits of each nibble.
6482     movl(rtmp, dst);
6483     andl(rtmp, 0x33333333);
6484     shll(rtmp, 2);
6485     andl(dst, 0xCCCCCCCC);
6486     shrl(dst, 2);
6487     orl(dst, rtmp);
6488 
6489     // Swap LSB and MSB 4 bits of each byte.
6490     movl(rtmp, dst);
6491     andl(rtmp, 0x0F0F0F0F);
6492     shll(rtmp, 4);
6493     andl(dst, 0xF0F0F0F0);
6494     shrl(dst, 4);
6495     orl(dst, rtmp);
6496   }
6497   bswapl(dst);
6498 }
6499 
6500 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6501                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
6503     // Galois field instruction based bit reversal based on following algorithm.
6504     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6505     mov64(rtmp1, 0x8040201008040201L);
6506     movq(xtmp1, src);
6507     movq(xtmp2, rtmp1);
6508     gf2p8affineqb(xtmp1, xtmp2, 0);
6509     movq(dst, xtmp1);
6510   } else {
6511     // Swap even and odd numbered bits.
6512     movq(rtmp1, src);
6513     mov64(rtmp2, 0x5555555555555555L);
6514     andq(rtmp1, rtmp2);
6515     shlq(rtmp1, 1);
6516     movq(dst, src);
6517     notq(rtmp2);
6518     andq(dst, rtmp2);
6519     shrq(dst, 1);
6520     orq(dst, rtmp1);
6521 
6522     // Swap LSB and MSB 2 bits of each nibble.
6523     movq(rtmp1, dst);
6524     mov64(rtmp2, 0x3333333333333333L);
6525     andq(rtmp1, rtmp2);
6526     shlq(rtmp1, 2);
6527     notq(rtmp2);
6528     andq(dst, rtmp2);
6529     shrq(dst, 2);
6530     orq(dst, rtmp1);
6531 
6532     // Swap LSB and MSB 4 bits of each byte.
6533     movq(rtmp1, dst);
6534     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6535     andq(rtmp1, rtmp2);
6536     shlq(rtmp1, 4);
6537     notq(rtmp2);
6538     andq(dst, rtmp2);
6539     shrq(dst, 4);
6540     orq(dst, rtmp1);
6541   }
6542   bswapq(dst);
6543 }
6544 
6545 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6546   Label done;
6547   Label neg_divisor_fastpath;
6548   cmpq(divisor, 0);
6549   jccb(Assembler::less, neg_divisor_fastpath);
6550   xorl(rdx, rdx);
6551   divq(divisor);
6552   jmpb(done);
6553   bind(neg_divisor_fastpath);
6554   // Fastpath for divisor < 0:
6555   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6556   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6557   movq(rdx, rax);
6558   subq(rdx, divisor);
6559   if (VM_Version::supports_bmi1()) {
6560     andnq(rax, rdx, rax);
6561   } else {
6562     notq(rdx);
6563     andq(rax, rdx);
6564   }
6565   shrq(rax, 63);
6566   bind(done);
6567 }
6568 
6569 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6570   Label done;
6571   Label neg_divisor_fastpath;
6572   cmpq(divisor, 0);
6573   jccb(Assembler::less, neg_divisor_fastpath);
6574   xorq(rdx, rdx);
6575   divq(divisor);
6576   jmp(done);
6577   bind(neg_divisor_fastpath);
6578   // Fastpath when divisor < 0:
6579   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6580   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6581   movq(rdx, rax);
6582   subq(rax, divisor);
6583   if (VM_Version::supports_bmi1()) {
6584     andnq(rax, rax, rdx);
6585   } else {
6586     notq(rax);
6587     andq(rax, rdx);
6588   }
6589   sarq(rax, 63);
6590   andq(rax, divisor);
6591   subq(rdx, rax);
6592   bind(done);
6593 }
6594 
6595 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6596   Label done;
6597   Label neg_divisor_fastpath;
6598   cmpq(divisor, 0);
6599   jccb(Assembler::less, neg_divisor_fastpath);
6600   xorq(rdx, rdx);
6601   divq(divisor);
6602   jmp(done);
6603   bind(neg_divisor_fastpath);
6604   // Fastpath for divisor < 0:
6605   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6606   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6607   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6608   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6609   movq(rdx, rax);
6610   subq(rax, divisor);
6611   if (VM_Version::supports_bmi1()) {
6612     andnq(rax, rax, rdx);
6613   } else {
6614     notq(rax);
6615     andq(rax, rdx);
6616   }
6617   movq(tmp, rax);
6618   shrq(rax, 63); // quotient
6619   sarq(tmp, 63);
6620   andq(tmp, divisor);
6621   subq(rdx, tmp); // remainder
6622   bind(done);
6623 }
6624 
6625 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6626                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6627                                         int vlen_enc) {
6628   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and the indices are determined using
  // the lower 4 bits of each shuffle lane, thus all shuffle indices are
  // normalized to the range 0-15. This ensures that any multiple of 16 added
  // to an index selects the same relative byte position within a 128 bit
  // lane, e.g. shuffle indices 16, 32 and 48 all select the first byte of
  // their respective 128 bit lanes.
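  // For example, shuffle index 37 = 32 + 5: the low four bits select byte 5
  // within a lane, and the range checks below route the broadcast of the
  // third 128-bit lane into that position.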
6635   movl(rtmp, 16);
6636   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6637 
6638   // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
6639   // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
6640   // original shuffle indices and move the shuffled lanes corresponding to true
6641   // mask to destination vector.
6642   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6643   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6644   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6645 
6646   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6647   // and broadcasting second 128 bit lane.
6648   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6649   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6650   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6651   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6652   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6653 
6654   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6655   // and broadcasting third 128 bit lane.
6656   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6657   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6658   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6659   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6660   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6661 
  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
6664   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6665   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6666   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6667   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6668   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6669 }
6670 
6671 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6672                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6673   if (vlen_enc == AVX_128bit) {
6674     vpermilps(dst, src, shuffle, vlen_enc);
6675   } else if (bt == T_INT) {
6676     vpermd(dst, shuffle, src, vlen_enc);
6677   } else {
6678     assert(bt == T_FLOAT, "");
6679     vpermps(dst, shuffle, src, vlen_enc);
6680   }
6681 }
6682 
6683 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6684   switch(opcode) {
6685     case Op_AddHF: vaddsh(dst, src1, src2); break;
6686     case Op_SubHF: vsubsh(dst, src1, src2); break;
6687     case Op_MulHF: vmulsh(dst, src1, src2); break;
6688     case Op_DivHF: vdivsh(dst, src1, src2); break;
6689     default: assert(false, "%s", NodeClassNames[opcode]); break;
6690   }
6691 }
6692 
6693 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6694   switch(elem_bt) {
6695     case T_BYTE:
6696       if (ideal_opc == Op_SaturatingAddV) {
6697         vpaddsb(dst, src1, src2, vlen_enc);
6698       } else {
6699         assert(ideal_opc == Op_SaturatingSubV, "");
6700         vpsubsb(dst, src1, src2, vlen_enc);
6701       }
6702       break;
6703     case T_SHORT:
6704       if (ideal_opc == Op_SaturatingAddV) {
6705         vpaddsw(dst, src1, src2, vlen_enc);
6706       } else {
6707         assert(ideal_opc == Op_SaturatingSubV, "");
6708         vpsubsw(dst, src1, src2, vlen_enc);
6709       }
6710       break;
6711     default:
6712       fatal("Unsupported type %s", type2name(elem_bt));
6713       break;
6714   }
6715 }
6716 
6717 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6718   switch(elem_bt) {
6719     case T_BYTE:
6720       if (ideal_opc == Op_SaturatingAddV) {
6721         vpaddusb(dst, src1, src2, vlen_enc);
6722       } else {
6723         assert(ideal_opc == Op_SaturatingSubV, "");
6724         vpsubusb(dst, src1, src2, vlen_enc);
6725       }
6726       break;
6727     case T_SHORT:
6728       if (ideal_opc == Op_SaturatingAddV) {
6729         vpaddusw(dst, src1, src2, vlen_enc);
6730       } else {
6731         assert(ideal_opc == Op_SaturatingSubV, "");
6732         vpsubusw(dst, src1, src2, vlen_enc);
6733       }
6734       break;
6735     default:
6736       fatal("Unsupported type %s", type2name(elem_bt));
6737       break;
6738   }
6739 }
6740 
6741 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6742                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when the magnitude of the second
  // input is greater than that of the first input.
  // overflow_mask = Inp1 <u Inp2
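  // e.g. for ints, 5 -u 7 underflows (5 <u 7), so the result saturates to zero.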
6745   evpcmpu(elem_bt, ktmp,  src2, src1, Assembler::lt, vlen_enc);
6746   // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6747   evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6748 }
6749 
6750 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6751                                                               XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6752   // Emulate unsigned comparison using signed comparison
  // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE <s Inp2 + MIN_VALUE
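  // e.g. for ints, 1 <u 0xFFFFFFFF holds, and after biasing both sides by
  // MIN_VALUE the signed comparison 0x80000001 <s 0x7FFFFFFF holds as well.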
6754   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6755   vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6756   vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6757 
6758   vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6759 
6760   // Res = INP1 - INP2 (non-commutative and non-associative)
6761   vpsub(elem_bt, dst, src1, src2, vlen_enc);
6762   // Res = Mask ? Zero : Res
6763   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6764   vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6765 }
6766 
6767 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6768                                                                XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned value ranges comprise only non-negative numbers, thus only an upper
  // bound saturation exists.
6770   // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6771   // Res = Signed Add INP1, INP2
6772   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6773   // T1 = SRC1 | SRC2
6774   vpor(xtmp1, src1, src2, vlen_enc);
6775   // Max_Unsigned = -1
6776   vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6777   // Unsigned compare:  Mask = Res <u T1
6778   evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6779   // res  = Mask ? Max_Unsigned : Res
6780   evpblend(elem_bt, dst, ktmp,  dst, xtmp2, true, vlen_enc);
6781 }
6782 
6783 //
// Section 2-13 of Hacker's Delight lists the following overflow detection check
// for the saturating unsigned addition operation.
//    overflow_mask = ((a & b) | ((a | b) & ~(a + b))) >>> 31 == 1
//
// We empirically determined its semantic equivalence to the following reduced
// expression
//    overflow_mask = (a + b) <u (a | b)
//
// and also verified it through the Alive2 solver.
6792 // (https://alive2.llvm.org/ce/z/XDQ7dY)
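//
// For example, with 32-bit lanes, a = 0xFFFFFFFF and b = 1 give a + b = 0 and
// a | b = 0xFFFFFFFF, so 0 <u 0xFFFFFFFF flags the overflow, whereas a = 1 and
// b = 2 give 3 <u 3 == false, i.e. no saturation is needed.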
6793 //

void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // Compute T1 = INP1 | INP2
  vpor(xtmp3, src1, src2, vlen_enc);
  // xtmp2 = minimum signed value; xtmp1 is left holding all ones (Max_Unsigned).
  vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  // Bias T1 into the signed domain, T1<s> = T1 + MIN_VALUE
  vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
  // Bias Res into the signed domain, Res<s> = Res + MIN_VALUE
  vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<s> <s T1<s>
  if (elem_bt == T_INT) {
    vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
  } else {
    assert(elem_bt == T_LONG, "");
    vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
  }
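  // Res = overflow_mask ? Max_Unsigned (all ones, in xtmp1) : Res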
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}

void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovq2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
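    // xtmp1 = src >>s 63: all ones for negative lanes, zero otherwise; comparing
    // against xtmp2 (-1) then sets ktmp for exactly the negative lanes.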
    evpsraq(xtmp1, src, 63, vlen_enc);
    evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}

void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
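  // 32-bit variant of evpmovq2m_emu above: shift by 31 and compare against -1.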
  if (VM_Version::supports_avx512dq()) {
    evpmovd2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    vpsrad(xtmp1, src, 31, vlen_enc);
    Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}

void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
  if (elem_bt == T_LONG) {
    if (VM_Version::supports_evex()) {
      evpsraq(dst, src, 63, vlen_enc);
    } else {
      vpsrad(dst, src, 31, vlen_enc);
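      // Replicate each qword's high (sign) dword into both dword positions
      // (0xF5 selects dwords {1,1,3,3} within each 128-bit lane).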
      vpshufd(dst, dst, 0xF5, vlen_enc);
    }
  } else {
    assert(elem_bt == T_INT, "");
    vpsrad(dst, src, 31, vlen_enc);
  }
}

void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsrlq(dst, allones, 1, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpsrld(dst, allones, 1, vlen_enc);
  }
}

void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsllq(dst, allones, 63, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpslld(dst, allones, 31, vlen_enc);
  }
}
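
// The two generators above derive the saturation constants branchlessly from an
// all-ones register (illustrative, 32-bit case):
//   MAX_VALUE = 0xFFFFFFFF >>> 1  == 0x7FFFFFFF
//   MIN_VALUE = 0xFFFFFFFF << 31  == 0x80000000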

void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2,
                                Assembler::ComparisonPredicate cond, int vlen_enc) {
  switch(elem_bt) {
    case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
    case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
    case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
    case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
    default: fatal("Unsupported type %s", type2name(elem_bt)); break;
  }
}

void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
    case T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
    case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
    case T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
    default: fatal("Unsupported type %s", type2name(elem_bt)); break;
  }
}

void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
  if (elem_bt == T_LONG) {
    evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  } else {
    assert(elem_bt == T_INT, "");
    evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  }
}

void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                         KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection is based on Hacker's Delight, section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if the result's polarity does not match the (equal) polarity of the inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when the inputs have opposite polarity and
    // the result's polarity does not match the first input's polarity.
    // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Compute overflow detection mask.
  evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 holds -1 in all its lanes after the above call.

  // Compute mask based on first input polarity.
  evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);

  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in the first input polarity mask hold the MIN value.
  evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
  // Blend destination lanes with saturated values using the overflow detection mask.
  evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
}
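
// Scalar per-lane equivalent of the signed saturating flow above (an
// illustrative sketch, not emitted code), for the 32-bit add case assuming
// wrap-around arithmetic:
//   int32_t res = a + b;                          // wrapping two's complement add
//   bool ovf    = (((res ^ a) & (res ^ b)) < 0);  // Hacker's Delight 2-13
//   int32_t sat = (a < 0) ? INT32_MIN : INT32_MAX;
//   res = ovf ? sat : res;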

void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                        XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                        XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection is based on Hacker's Delight, section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if the result's polarity does not match the (equal) polarity of the inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when the inputs have opposite polarity and
    // the result's polarity does not match the first input's polarity.
    // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Sign-extend to compute overflow detection mask.
  vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
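  // (vpblendvb below selects each byte by the MSB of the corresponding mask byte,
  // which is why the mask must be sign-extended across the full lane width.)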

  vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
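  // xtmp1 now holds all ones and feeds the MAX/MIN constant generators below.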
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose saturating min/max vector using first input polarity mask.
  vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
  vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);

  // Blend result with saturating vector using overflow detection mask.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}

void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                     XMMRegister src2, int vlen_enc) {
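  // evpermi2* reads the permute indices from dst and selects elements from the
  // two-register table {src1, src2}, overwriting dst with the result.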
  switch(elem_bt) {
    case T_BYTE:
      evpermi2b(dst, src1, src2, vlen_enc);
      break;
    case T_SHORT:
      evpermi2w(dst, src1, src2, vlen_enc);
      break;
    case T_INT:
      evpermi2d(dst, src1, src2, vlen_enc);
      break;
    case T_LONG:
      evpermi2q(dst, src1, src2, vlen_enc);
      break;
    case T_FLOAT:
      evpermi2ps(dst, src1, src2, vlen_enc);
      break;
    case T_DOUBLE:
      evpermi2pd(dst, src1, src2, vlen_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
  if (is_unsigned) {
    vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  } else {
    vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  }
}

void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
  if (is_unsigned) {
    vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  } else {
    vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  }
}

void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(opcode) {
    case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
    case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
    case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
    case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
    default: assert(false, "%s", NodeClassNames[opcode]); break;
  }
}

void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
  switch(opcode) {
    case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
    case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
    case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
    case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
    default: assert(false, "%s", NodeClassNames[opcode]); break;
  }
}

void C2_MacroAssembler::sminmax_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                     KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
  vminmax_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
}

void C2_MacroAssembler::sminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                             KRegister ktmp) {
  if (opcode == Op_MaxHF) {
    // dst = max(src1, src2)
    evminmaxsh(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN);
  } else {
    assert(opcode == Op_MinHF, "");
    // dst = min(src1, src2)
    evminmaxsh(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN);
  }
}

void C2_MacroAssembler::vminmax_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                     KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
    // Move sign bits of src2 to mask register.
    evpmovw2m(ktmp, src2, vlen_enc);
    // xtmp1 = src2 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // The swapping above aims to make the second source operand non-negative.
    // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned.
    // dst = max(xtmp1, xtmp2)
    evmaxph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // The final result equals the first source if it is a NaN; if the second operand
    // holds a NaN then, per the semantics above, the result is already the second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  } else {
    assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
    // Move sign bits of src1 to mask register.
    evpmovw2m(ktmp, src1, vlen_enc);
    // xtmp1 = src1 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src1 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // The swapping above aims to make the second source operand negative.
    // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned.
    // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
    // or a valid floating-point value, is written to the result.
    // dst = min(xtmp1, xtmp2)
    evminph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // The final result equals the first source if it is a NaN; if the second operand
    // holds a NaN then, per the semantics above, the result is already the second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  }
}
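
// Note: the blend/minmax/NaN-fixup sequence above yields Java-style Float16
// min/max semantics: NaN is returned if either input is NaN, and -0.0 orders
// below +0.0.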

void C2_MacroAssembler::vminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                             KRegister ktmp, int vlen_enc) {
  if (opcode == Op_MaxVHF) {
    // dst = max(src1, src2)
    evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vlen_enc);
  } else {
    assert(opcode == Op_MinVHF, "");
    // dst = min(src1, src2)
    evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vlen_enc);
  }
}

void C2_MacroAssembler::vminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, Address src2,
                                             KRegister ktmp, int vlen_enc) {
  if (opcode == Op_MaxVHF) {
    // dst = max(src1, src2)
    evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vlen_enc);
  } else {
    assert(opcode == Op_MinVHF, "");
    // dst = min(src1, src2)
    evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vlen_enc);
  }
}

int C2_MacroAssembler::vector_iota_entry_index(BasicType bt) {
  // The vector iota entries array is ordered by type B/S/I/L/F/D, and
  // the byte offset between two consecutive types' entries is 16.
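  // e.g., assuming callers scale the returned index by the 16-byte stride noted
  // above, the T_INT entries start at byte offset 2 * 16 = 32.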
  switch(bt) {
  case T_BYTE:
    return 0;
  case T_SHORT:
    return 1;
  case T_INT:
    return 2;
  case T_LONG:
    return 3;
  case T_FLOAT:
    return 4;
  case T_DOUBLE:
    return 5;
  default:
    ShouldNotReachHere();
  }
}