1 /*
   2  * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "gc/shared/barrierSet.hpp"
  28 #include "gc/shared/barrierSetAssembler.hpp"
  29 #include "oops/methodData.hpp"
  30 #include "opto/c2_MacroAssembler.hpp"
  31 #include "opto/intrinsicnode.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/opcodes.hpp"
  34 #include "opto/subnode.hpp"
  35 #include "runtime/globals.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/objectMonitorTable.hpp"
  38 #include "runtime/stubRoutines.hpp"
  39 #include "runtime/synchronizer.hpp"
  40 #include "utilities/checkedCast.hpp"
  41 #include "utilities/globalDefinitions.hpp"
  42 #include "utilities/powerOfTwo.hpp"
  43 #include "utilities/sizes.hpp"
  44 
  45 #ifdef PRODUCT
  46 #define BLOCK_COMMENT(str) /* nothing */
  47 #define STOP(error) stop(error)
  48 #else
  49 #define BLOCK_COMMENT(str) block_comment(str)
  50 #define STOP(error) block_comment(error); stop(error)
  51 #endif
  52 
  53 // C2 compiled method's prolog code.
  54 // Beware! This sp_inc is NOT the same as the one mentioned in MacroAssembler::remove_frame but only the size
  55 // of the extension space + the additional copy of the return address. That means, it doesn't contain the
  56 // frame size (where the local and sp_inc are) and the saved RBP.
  57 void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  58   if (C->clinit_barrier_on_entry()) {
  59     assert(VM_Version::supports_fast_class_init_checks(), "sanity");
  60     assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");
  61 
  62     Label L_skip_barrier;
  63     Register klass = rscratch1;
  64 
  65     mov_metadata(klass, C->method()->holder()->constant_encoding());
  66     clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
  67 
  68     jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
  69 
  70     bind(L_skip_barrier);
  71   }
  72 
  73   int framesize = C->output()->frame_size_in_bytes();
  74   int bangsize = C->output()->bang_size_in_bytes();
  75   bool fp_mode_24b = false;
  76   int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;
  77 
  78   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  79 
  80   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  81   // Remove word for return addr
  82   framesize -= wordSize;
  83   stack_bang_size -= wordSize;
  84 
  85   // Calls to C2R adapters often do not accept exceptional returns.
  86   // We require that their callers must bang for them.  But be careful, because
  87   // some VM calls (such as call site linkage) can use several kilobytes of
  88   // stack.  But the stack safety zone should account for that.
  89   // See bugs 4446381, 4468289, 4497237.
  90   if (stack_bang_size > 0) {
  91     generate_stack_overflow_check(stack_bang_size);
  92 
  93     // We always push rbp, so that on return to interpreter rbp, will be
  94     // restored correctly and we can correct the stack.
  95     push(rbp);
  96 #ifdef ASSERT
  97     if (sp_inc > 0) {
  98       movl(Address(rsp, 0), badRegWordVal);
  99       movl(Address(rsp, VMRegImpl::stack_slot_size), badRegWordVal);
 100     }
 101 #endif
 102     // Save caller's stack pointer into RBP if the frame pointer is preserved.
 103     if (PreserveFramePointer) {
 104       mov(rbp, rsp);
 105     }
 106     // Remove word for ebp
 107     framesize -= wordSize;
 108 
 109     // Create frame
 110     if (framesize) {
 111       subptr(rsp, framesize);
 112     }
 113   } else {
 114     subptr(rsp, framesize);
 115 
 116     // Save RBP register now.
 117     framesize -= wordSize;
 118     movptr(Address(rsp, framesize), rbp);
 119 #ifdef ASSERT
 120     if (sp_inc > 0) {
 121       movl(Address(rsp, framesize), badRegWordVal);
 122       movl(Address(rsp, framesize + VMRegImpl::stack_slot_size), badRegWordVal);
 123     }
 124 #endif
 125     // Save caller's stack pointer into RBP if the frame pointer is preserved.
 126     if (PreserveFramePointer) {
 127       movptr(rbp, rsp);
 128       if (framesize > 0) {
 129         addptr(rbp, framesize);
 130       }
 131     }
 132   }
 133 
 134   if (C->needs_stack_repair()) {
 135     // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
 136     assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
 137     movptr(Address(rsp, framesize - wordSize), sp_inc + framesize);
 138   }
 139 
 140   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 141     framesize -= wordSize;
 142     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 143   }
 144 
 145 #ifdef ASSERT
 146   if (VerifyStackAtCalls) {
 147     Label L;
 148     push(rax);
 149     mov(rax, rsp);
 150     andptr(rax, StackAlignmentInBytes-1);
 151     cmpptr(rax, StackAlignmentInBytes-wordSize);
 152     pop(rax);
 153     jcc(Assembler::equal, L);
 154     STOP("Stack is not properly aligned!");
 155     bind(L);
 156   }
 157 #endif
 158 }
 159 
 160 void C2_MacroAssembler::entry_barrier() {
 161   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 162   // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 163   Label dummy_slow_path;
 164   Label dummy_continuation;
 165   Label* slow_path = &dummy_slow_path;
 166   Label* continuation = &dummy_continuation;
 167   if (!Compile::current()->output()->in_scratch_emit_size()) {
 168     // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 169     C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 170     Compile::current()->output()->add_stub(stub);
 171     slow_path = &stub->entry();
 172     continuation = &stub->continuation();
 173   }
 174   bs->nmethod_entry_barrier(this, slow_path, continuation);
 175 }
 176 
 177 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 178   switch (vlen_in_bytes) {
 179     case  4: // fall-through
 180     case  8: // fall-through
 181     case 16: return Assembler::AVX_128bit;
 182     case 32: return Assembler::AVX_256bit;
 183     case 64: return Assembler::AVX_512bit;
 184 
 185     default: {
 186       ShouldNotReachHere();
 187       return Assembler::AVX_NoVec;
 188     }
 189   }
 190 }
 191 
 192 // fast_lock and fast_unlock used by C2
 193 
 194 // Because the transitions from emitted code to the runtime
 195 // monitorenter/exit helper stubs are so slow it's critical that
 196 // we inline both the lock-stack fast path and the inflated fast path.
 197 //
 198 // See also: cmpFastLock and cmpFastUnlock.
 199 //
 200 // What follows is a specialized inline transliteration of the code
 201 // in enter() and exit(). If we're concerned about I$ bloat another
 202 // option would be to emit TrySlowEnter and TrySlowExit methods
 203 // at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 205 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 206 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 207 // In practice, however, the # of lock sites is bounded and is usually small.
 208 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
 209 // if the processor uses simple bimodal branch predictors keyed by EIP
 210 // Since the helper routines would be called from multiple synchronization
 211 // sites.
 212 //
 213 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
 214 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 215 // to those specialized methods.  That'd give us a mostly platform-independent
 216 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
 218 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 219 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 220 // (b) explicit barriers or fence operations.
 221 //
 222 // TODO:
 223 //
 224 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 225 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 226 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 227 //    the lock operators would typically be faster than reifying Self.
 228 //
 229 // *  Ideally I'd define the primitives as:
 230 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 231 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 232 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 233 //    Instead, we're stuck with a rather awkward and brittle register assignments below.
 234 //    Furthermore the register assignments are overconstrained, possibly resulting in
 235 //    sub-optimal code near the synchronization site.
 236 //
 237 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 238 //    Alternately, use a better sp-proximity test.
 239 //
 240 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 241 //    Either one is sufficient to uniquely identify a thread.
 242 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 243 //
 244 // *  Intrinsify notify() and notifyAll() for the common cases where the
 245 //    object is locked by the calling thread but the waitlist is empty.
//    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 247 //
 248 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 249 //    But beware of excessive branch density on AMD Opterons.
 250 //
 251 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 252 //    or failure of the fast path.  If the fast path fails then we pass
 253 //    control to the slow path, typically in C.  In fast_lock and
 254 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 255 //    will emit a conditional branch immediately after the node.
 256 //    So we have branches to branches and lots of ICC.ZF games.
 257 //    Instead, it might be better to have C2 pass a "FailureLabel"
 258 //    into fast_lock and fast_unlock.  In the case of success, control
 259 //    will drop through the node.  ICC.ZF is undefined at exit.
 260 //    In the case of failure, the node will branch directly to the
 261 //    FailureLabel
 262 
 263 // obj: object to lock
 264 // box: on-stack box address -- KILLED
 265 // rax: tmp -- KILLED
 266 // t  : tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register rax_reg,
                                  Register t, Register thread) {
  // Emits the lightweight-locking fast path (lock-stack first, then inflated
  // monitor). Contract with C2: ZF == 1 on exit means the lock was acquired;
  // ZF == 0 means control must continue at the runtime slow path.
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Diagnostic mode: route synchronization on value-based classes to the
    // slow path, which reports/handles the violation.
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  // 't' holds the mark word during the fast path.
  const Register mark = t;

  { // Fast Lock

    Label push;

    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    // rax = expected mark (unlocked bit set), mark = desired (bit cleared).
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmp(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      // Without the table, the mark word (minus its tag) is the monitor.
      assert(mark == monitor, "should be the same here");
    } else {
      const Register hash = t;
      Label monitor_found;

      // Look for the monitor in the om_cache.

      // Unrolled linear scan over the per-thread cache of (oop, monitor)
      // pairs; cache_offset advances one pair per iteration.
      ByteSize cache_offset   = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled  = OMCache::CAPACITY;
      for (int i = 0; i < num_unrolled; i++) {
        movptr(monitor, Address(thread,  cache_offset + monitor_offset));
        cmpptr(obj, Address(thread, cache_offset));
        jccb(Assembler::equal, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      // Look for the monitor in the table.

      // Get the hash code.
      movptr(hash, Address(obj, oopDesc::mark_offset_in_bytes()));
      shrq(hash, markWord::hash_shift);
      andq(hash, markWord::hash_mask);

      // Get the table and calculate the bucket's address.
      lea(rax_reg, ExternalAddress(ObjectMonitorTable::current_table_address()));
      movptr(rax_reg, Address(rax_reg));
      andq(hash, Address(rax_reg, ObjectMonitorTable::table_capacity_mask_offset()));
      movptr(rax_reg, Address(rax_reg, ObjectMonitorTable::table_buckets_offset()));

      // Read the monitor from the bucket.
      movptr(monitor, Address(rax_reg, hash, Address::times_ptr));

      // Check if the monitor in the bucket is special (empty, tombstone or removed)
      cmpptr(monitor, ObjectMonitorTable::SpecialPointerValues::below_is_special);
      jcc(Assembler::below, slow_path);

      // Check if object matches.
      movptr(rax_reg, Address(monitor, ObjectMonitor::object_offset()));
      BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
      bs_asm->try_resolve_weak_handle_in_c2(this, rax_reg, slow_path);
      cmpptr(rax_reg, obj);
      jcc(Assembler::notEqual, slow_path);

      bind(monitor_found);
    }
    // Monitor fields are addressed off 'monitor'; without the table the
    // pointer still carries the mark-word tag, which the tag offset removes.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock before trashing box. On failure to acquire
      // the lock, the slow path will reset the entry accordingly (see CacheSetter).
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
    lock(); cmpxchgptr(box, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive.
    // CAS left the current owner in rax; equal to our id means re-entry.
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
 438 
 439 // obj: object to lock
 440 // rax: tmp -- KILLED
 441 // t  : tmp - cannot be obj nor rax -- KILLED
 442 //
 443 // Some commentary on balanced locking:
 444 //
 445 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 446 // Methods that don't have provably balanced locking are forced to run in the
 447 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 448 // The interpreter provides two properties:
 449 // I1:  At return-time the interpreter automatically and quietly unlocks any
 450 //      objects acquired in the current activation (frame).  Recall that the
 451 //      interpreter maintains an on-stack list of locks currently held by
 452 //      a frame.
 453 // I2:  If a method attempts to unlock an object that is not held by the
 454 //      frame the interpreter throws IMSX.
 455 //
 456 // Lets say A(), which has provably balanced locking, acquires O and then calls B().
 457 // B() doesn't have provably balanced locking so it runs in the interpreter.
 458 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 459 // is still locked by A().
 460 //
 461 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface
 462 // Specification" states that an object locked by JNI's MonitorEnter should not be
 463 // unlocked by "normal" java-level locking and vice-versa.  The specification doesn't
 464 // specify what will occur if a program engages in such mixed-mode locking, however.
 465 // Arguably given that the spec legislates the JNI case as undefined our implementation
 466 // could reasonably *avoid* checking owner in fast_unlock().
 467 // In the interest of performance we elide m->Owner==Self check in unlock.
 468 // A perfectly viable alternative is to elide the owner check except when
 469 // Xcheck:jni is enabled.
 470 
void C2_MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register t, Register thread) {
  // Emits the fast unlock path (lock-stack pop, then inflated-monitor exit).
  // Contract with C2: ZF == 1 on exit means the unlock succeeded; ZF == 0
  // means control must continue at the runtime slow path.
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully.  MUST jump with ZF == 1
  Label unlocked, slow_path;

  // Note the aliasing: mark/monitor share 't'; top and box may share rax.
  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  // 'dummy' stands in for the stub label during scratch-size emission.
  Label dummy;
  C2FastUnlockStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    stub = new (Compile::current()->comp_arena()) C2FastUnlockStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Fast Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    // If obj also sits in the next slot down, this was a recursive
    // lightweight lock and popping one entry finishes the unlock.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    // rax = expected (locked mark), mark = desired (unlocked bit set).
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask_in_place);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    // On CAS failure the stub re-pushes obj on the lock-stack before
    // taking the slow path.
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    // Debug-only: verify obj is nowhere on the lock-stack and that the mark
    // word really carries the monitor tag.
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jcc(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    // Monitor fields are addressed off 'monitor'; without the table the
    // pointer still carries the mark-word tag, which the tag offset removes.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jcc(Assembler::notZero, recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    cmpptr(entry_list_address, NULL_WORD);
    jcc(Assembler::zero, unlocked);    // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jcc(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
  }

  bind(unlocked);
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Unlock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
 632 
// Out-of-line failure handler called by verify_int_in_range when a CastII
// value escapes its type range; aborts the VM with the offending values.
static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}
 636 
 637 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
 638   const int framesize = Compile::current()->output()->frame_size_in_bytes();
 639   masm->movptr(dst, rsp);
 640   if (framesize > 2 * wordSize) {
 641     masm->addptr(dst, framesize - 2 * wordSize);
 642   }
 643 }
 644 
 645 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
 646   if (PreserveFramePointer) {
 647     // frame pointer is valid
 648 #ifdef ASSERT
 649     // Verify frame pointer value in rbp.
 650     reconstruct_frame_pointer_helper(this, rtmp);
 651     Label L_success;
 652     cmpq(rbp, rtmp);
 653     jccb(Assembler::equal, L_success);
 654     STOP("frame pointer mismatch");
 655     bind(L_success);
 656 #endif // ASSERT
 657   } else {
 658     reconstruct_frame_pointer_helper(this, rbp);
 659   }
 660 }
 661 
// Emit a debug-only runtime check that 'val' lies within the CastII type
// range [t->_lo, t->_hi]; out-of-range values call the fatal abort helper.
void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
  jint lo = t->_lo;
  jint hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
  // The full int range needs no check.
  if (t == TypeInt::INT) {
    return;
  }

  BLOCK_COMMENT("CastII {");
  Label fail;
  Label succeed;

  // Only emit the comparisons for bounds that actually constrain.
  if (lo != min_jint) {
    cmpl(val, lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jint) {
    cmpl(val, hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  // Marshal (idx, val, lo, hi) into the C calling convention and call the
  // noreturn abort helper; hlt() documents that control never returns.
  movl(c_rarg0, idx);
  movl(c_rarg1, val);
  movl(c_rarg2, lo);
  movl(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastII");
}
 695 
// Out-of-line failure handler called by verify_long_in_range when a CastLL
// value escapes its type range; aborts the VM with the offending values.
static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}
 699 
// Emit a debug-only runtime check that 'val' lies within the CastLL type
// range [t->_lo, t->_hi]; out-of-range values call the fatal abort helper.
// 'tmp' is used to materialize bounds that do not fit in a 32-bit immediate.
void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
  jlong lo = t->_lo;
  jlong hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
  // The full long range needs no check.
  if (t == TypeLong::LONG) {
    return;
  }

  BLOCK_COMMENT("CastLL {");
  Label fail;
  Label succeed;

  // Compare val against a 64-bit bound: use an immediate form when the
  // bound fits in a signed 32-bit field, otherwise go through tmp.
  auto cmp_val = [&](jlong bound) {
    if (is_simm32(bound)) {
      cmpq(val, checked_cast<int>(bound));
    } else {
      mov64(tmp, bound);
      cmpq(val, tmp);
    }
  };

  // Only emit the comparisons for bounds that actually constrain.
  if (lo != min_jlong) {
    cmp_val(lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jlong) {
    cmp_val(hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  // Marshal (idx, val, lo, hi) into the C calling convention and call the
  // noreturn abort helper; hlt() documents that control never returns.
  movl(c_rarg0, idx);
  movq(c_rarg1, val);
  mov64(c_rarg2, lo);
  mov64(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastLL");
}
 742 
 743 //-------------------------------------------------------------------------------------------
 744 // Generic instructions support for use in .ad files C2 code generation
 745 
 746 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 747   if (dst != src) {
 748     movdqu(dst, src);
 749   }
 750   if (opcode == Op_AbsVD) {
 751     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 752   } else {
 753     assert((opcode == Op_NegVD),"opcode should be Op_NegD");
 754     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 755   }
 756 }
 757 
 758 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 759   if (opcode == Op_AbsVD) {
 760     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 761   } else {
 762     assert((opcode == Op_NegVD),"opcode should be Op_NegD");
 763     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 764   }
 765 }
 766 
 767 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 768   if (dst != src) {
 769     movdqu(dst, src);
 770   }
 771   if (opcode == Op_AbsVF) {
 772     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 773   } else {
 774     assert((opcode == Op_NegVF),"opcode should be Op_NegF");
 775     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 776   }
 777 }
 778 
 779 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 780   if (opcode == Op_AbsVF) {
 781     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 782   } else {
 783     assert((opcode == Op_NegVF),"opcode should be Op_NegF");
 784     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 785   }
 786 }
 787 
// Signed vector min/max for SSE (destructive two-operand forms).
// B/S/I lanes map directly to pmin*/pmax* instructions. For T_LONG there is
// no packed signed 64-bit min/max before AVX-512, so the result is
// synthesized with pcmpgtq + blendvpd. The non-VEX blendvpd implicitly uses
// xmm0 as its blend mask, which is why tmp is required to be xmm0 in the
// T_LONG paths.
void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      // mask = (dst > src); blend picks src where mask set => min(dst, src)
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src);  // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      // mask = (src > dst); blend picks src where mask set => max(dst, src)
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src);  // xmm0 as mask
    }
  }
}
 824 
 825 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 826                                   XMMRegister src1, Address src2, int vlen_enc) {
 827   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 828   if (opcode == Op_UMinV) {
 829     switch(elem_bt) {
 830       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 831       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 832       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 833       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 834       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 835     }
 836   } else {
 837     assert(opcode == Op_UMaxV, "required");
 838     switch(elem_bt) {
 839       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 840       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 841       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 842       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 843       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 844     }
 845   }
 846 }
 847 
// Unsigned 64-bit lane min/max. On AVX-512 without VL, the EVEX unsigned
// min/max is issued at full 512-bit width (upper lanes are don't-care for
// narrower vectors). Otherwise there is no unsigned 64-bit compare, so both
// operands are biased by 2^63 (adding the sign bit) to turn the unsigned
// order into the signed order, compared with vpcmpgtq, and the result is
// selected with a byte blend on the original (unbiased) inputs.
void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // For optimality, leverage a full vector width of 512 bits
  // for operations over smaller vector sizes on AVX512 targets.
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
    // T1 = -1
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
    // Mask = T2 > T1
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}
 878 
 879 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 880                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
 881   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 882   if (opcode == Op_UMinV) {
 883     switch(elem_bt) {
 884       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 885       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 886       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 887       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 888       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 889     }
 890   } else {
 891     assert(opcode == Op_UMaxV, "required");
 892     switch(elem_bt) {
 893       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 894       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 895       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 896       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 897       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 898     }
 899   }
 900 }
 901 
// Signed vector min/max, AVX three-operand forms. B/S/I lanes map directly
// to vpmin*/vpmax*. For T_LONG, vpminsq/vpmaxsq needs AVX-512 (VL for
// sub-512-bit vectors); otherwise the result is synthesized with vpcmpgtq
// producing an all-ones/all-zeros lane mask in dst, then vblendvpd selecting
// between src1/src2 on that mask (blend order is swapped between min and max).
void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        // mask = (src1 > src2); select src2 where set => min
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        // mask = (src1 > src2); select src1 where set => max
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}
 943 
 944 // Float/Double min max
 945 
// Java-semantics float/double vector min/max for AVX (no mask registers).
// Selects blend/compare/minmax member functions by element width and
// direction, then emits a fixed 5-instruction pattern. See the comment block
// below for why plain vminps/vmaxps is insufficient.
void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   *  Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  // Indirect dispatch keeps a single emission sequence for all four
  // (width x direction) combinations.
  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  // E-core optimization: materialize the sign mask up front (arithmetic
  // shift of the sign bit across the lane) so the blends below can use the
  // cheaper explicit-mask form instead of sign-bit blends.
  bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}
1033 
// Java-semantics float/double vector min/max using AVX-512 mask registers.
// Same algorithm as vminmax_fp (bias on sign, vmin/vmax, then repair NaN
// lanes), but sign extraction (evpmov[dq]2m), blends (evblendm*) and the
// final NaN merge (masked evmovdqu*) all go through ktmp instead of vector
// masks.
void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    // ktmp = sign bits of a; route negative lanes so vminps sees them second
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    // Repair lanes where atmp was NaN (vminps would have returned btmp)
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}
1080 
1081 void C2_MacroAssembler::vminmax_fp_avx10_2(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1082                                            XMMRegister src1, XMMRegister src2, int vlen_enc) {
1083   assert(opc == Op_MinV || opc == Op_MinReductionV ||
1084          opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");
1085 
1086   int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
1087                                                          : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
1088   if (elem_bt == T_FLOAT) {
1089     evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
1090   } else {
1091     assert(elem_bt == T_DOUBLE, "");
1092     evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
1093   }
1094 }
1095 
1096 void C2_MacroAssembler::sminmax_fp_avx10_2(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1097                                            XMMRegister src1, XMMRegister src2) {
1098   assert(opc == Op_MinF || opc == Op_MaxF ||
1099          opc == Op_MinD || opc == Op_MaxD, "sanity");
1100 
1101   int imm8 = (opc == Op_MinF || opc == Op_MinD) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
1102                                                 : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
1103   if (elem_bt == T_FLOAT) {
1104     evminmaxss(dst, mask, src1, src2, true, imm8);
1105   } else {
1106     assert(elem_bt == T_DOUBLE, "");
1107     evminmaxsd(dst, mask, src1, src2, true, imm8);
1108   }
1109 }
1110 
1111 // Float/Double signum
// Float/Double signum
// Computes Math.signum: returns the argument unchanged for +/-0.0 and NaN,
// otherwise +1.0 (positive input) or -1.0 (negative input, produced by
// flipping the sign bit of +1.0).
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  // Handle special cases +0.0/-0.0 and NaN, if argument is +0.0/-0.0 or NaN, return argument
  // If AVX10.2 (or newer) floating point comparison instructions used, SF=1 for equal and unordered cases
  // If other floating point comparison instructions used, ZF=1 for equal and unordered cases
  if (opcode == Op_SignumF) {
    if (VM_Version::supports_avx10_2()) {
      evucomxss(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomiss(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    // Non-zero, non-NaN: start from +1.0; flags from the compare above still
    // hold, so 'above' means the input was positive.
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    // Negative input: flip the sign bit of +1.0 to get -1.0.
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    if (VM_Version::supports_avx10_2()) {
      evucomxsd(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomisd(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}
1146 
1147 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1148   if (sign) {
1149     pmovsxbw(dst, src);
1150   } else {
1151     pmovzxbw(dst, src);
1152   }
1153 }
1154 
1155 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1156   if (sign) {
1157     vpmovsxbw(dst, src, vector_len);
1158   } else {
1159     vpmovzxbw(dst, src, vector_len);
1160   }
1161 }
1162 
1163 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1164   if (sign) {
1165     vpmovsxbd(dst, src, vector_len);
1166   } else {
1167     vpmovzxbd(dst, src, vector_len);
1168   }
1169 }
1170 
1171 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1172   if (sign) {
1173     vpmovsxwd(dst, src, vector_len);
1174   } else {
1175     vpmovzxwd(dst, src, vector_len);
1176   }
1177 }
1178 
1179 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1180                                      int shift, int vector_len) {
1181   if (opcode == Op_RotateLeftV) {
1182     if (etype == T_INT) {
1183       evprold(dst, src, shift, vector_len);
1184     } else {
1185       assert(etype == T_LONG, "expected type T_LONG");
1186       evprolq(dst, src, shift, vector_len);
1187     }
1188   } else {
1189     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1190     if (etype == T_INT) {
1191       evprord(dst, src, shift, vector_len);
1192     } else {
1193       assert(etype == T_LONG, "expected type T_LONG");
1194       evprorq(dst, src, shift, vector_len);
1195     }
1196   }
1197 }
1198 
1199 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1200                                      XMMRegister shift, int vector_len) {
1201   if (opcode == Op_RotateLeftV) {
1202     if (etype == T_INT) {
1203       evprolvd(dst, src, shift, vector_len);
1204     } else {
1205       assert(etype == T_LONG, "expected type T_LONG");
1206       evprolvq(dst, src, shift, vector_len);
1207     }
1208   } else {
1209     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1210     if (etype == T_INT) {
1211       evprorvd(dst, src, shift, vector_len);
1212     } else {
1213       assert(etype == T_LONG, "expected type T_LONG");
1214       evprorvq(dst, src, shift, vector_len);
1215     }
1216   }
1217 }
1218 
1219 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1220   if (opcode == Op_RShiftVI) {
1221     psrad(dst, shift);
1222   } else if (opcode == Op_LShiftVI) {
1223     pslld(dst, shift);
1224   } else {
1225     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1226     psrld(dst, shift);
1227   }
1228 }
1229 
1230 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1231   switch (opcode) {
1232     case Op_RShiftVI:  psrad(dst, shift); break;
1233     case Op_LShiftVI:  pslld(dst, shift); break;
1234     case Op_URShiftVI: psrld(dst, shift); break;
1235 
1236     default: assert(false, "%s", NodeClassNames[opcode]);
1237   }
1238 }
1239 
1240 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1241   if (opcode == Op_RShiftVI) {
1242     vpsrad(dst, nds, shift, vector_len);
1243   } else if (opcode == Op_LShiftVI) {
1244     vpslld(dst, nds, shift, vector_len);
1245   } else {
1246     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1247     vpsrld(dst, nds, shift, vector_len);
1248   }
1249 }
1250 
1251 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1252   switch (opcode) {
1253     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1254     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1255     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1256 
1257     default: assert(false, "%s", NodeClassNames[opcode]);
1258   }
1259 }
1260 
1261 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1262   switch (opcode) {
1263     case Op_RShiftVB:  // fall-through
1264     case Op_RShiftVS:  psraw(dst, shift); break;
1265 
1266     case Op_LShiftVB:  // fall-through
1267     case Op_LShiftVS:  psllw(dst, shift);   break;
1268 
1269     case Op_URShiftVS: // fall-through
1270     case Op_URShiftVB: psrlw(dst, shift);  break;
1271 
1272     default: assert(false, "%s", NodeClassNames[opcode]);
1273   }
1274 }
1275 
1276 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1277   switch (opcode) {
1278     case Op_RShiftVB:  // fall-through
1279     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1280 
1281     case Op_LShiftVB:  // fall-through
1282     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1283 
1284     case Op_URShiftVS: // fall-through
1285     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1286 
1287     default: assert(false, "%s", NodeClassNames[opcode]);
1288   }
1289 }
1290 
1291 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1292   switch (opcode) {
1293     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems
1294     case Op_LShiftVL:  psllq(dst, shift); break;
1295     case Op_URShiftVL: psrlq(dst, shift); break;
1296 
1297     default: assert(false, "%s", NodeClassNames[opcode]);
1298   }
1299 }
1300 
1301 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1302   if (opcode == Op_RShiftVL) {
1303     psrlq(dst, shift);  // using srl to implement sra on pre-avs512 systems
1304   } else if (opcode == Op_LShiftVL) {
1305     psllq(dst, shift);
1306   } else {
1307     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1308     psrlq(dst, shift);
1309   }
1310 }
1311 
1312 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1313   switch (opcode) {
1314     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1315     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1316     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1317 
1318     default: assert(false, "%s", NodeClassNames[opcode]);
1319   }
1320 }
1321 
1322 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1323   if (opcode == Op_RShiftVL) {
1324     evpsraq(dst, nds, shift, vector_len);
1325   } else if (opcode == Op_LShiftVL) {
1326     vpsllq(dst, nds, shift, vector_len);
1327   } else {
1328     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1329     vpsrlq(dst, nds, shift, vector_len);
1330   }
1331 }
1332 
1333 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1334   switch (opcode) {
1335     case Op_RShiftVB:  // fall-through
1336     case Op_RShiftVS:  // fall-through
1337     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1338 
1339     case Op_LShiftVB:  // fall-through
1340     case Op_LShiftVS:  // fall-through
1341     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1342 
1343     case Op_URShiftVB: // fall-through
1344     case Op_URShiftVS: // fall-through
1345     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1346 
1347     default: assert(false, "%s", NodeClassNames[opcode]);
1348   }
1349 }
1350 
1351 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1352   switch (opcode) {
1353     case Op_RShiftVB:  // fall-through
1354     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1355 
1356     case Op_LShiftVB:  // fall-through
1357     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1358 
1359     case Op_URShiftVB: // fall-through
1360     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1361 
1362     default: assert(false, "%s", NodeClassNames[opcode]);
1363   }
1364 }
1365 
// Per-lane variable shift of packed longs. Left and unsigned-right shifts
// map directly to AVX2 instructions. The arithmetic right shift has no AVX2
// instruction: with AVX-512 it uses evpsravq (widening to 512-bit when VL is
// unavailable); otherwise it is emulated with the identity
//   sra(x, s) == (srl(x, s) ^ m) - m,  where m = srl(0x8000...0, s)
// i.e. the logical shift is corrected by xor/subtract of the shifted sign
// mask, which re-propagates the sign bits. 'tmp' is only used by that
// emulation path.
void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        // tmp = per-lane sign-bit masks, shifted by the same counts
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        // Sign-extend: (logical result ^ mask) - mask
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}
1398 
1399 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
// AVX2 path for per-lane variable byte shifts (128-bit input only,
// vector_len == 0): widen bytes to dwords, do the dword variable shift, mask
// each result back to its low byte, then narrow via vpackusdw. The unsigned
// pack is safe because the mask clears everything above bit 7.
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);        // bytes -> dwords (256-bit)
  vpmovzxbd(vtmp, shift, 1);           // shift counts: bytes -> dwords
  varshiftd(opcode, dst, dst, vtmp, 1);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);        // dwords -> words (128-bit result)
}
1413 
1414 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
// EVEX path for per-lane variable byte shifts: widen bytes to words at the
// next-larger vector width, do the word variable shift, mask back to the low
// byte of each word, then narrow with vpackuswb. For 256/512-bit inputs the
// pack interleaves 128-bit lanes, so a vpermq(0xD8) restores lane order.
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  int ext_vector_len = vector_len + 1;   // widened data needs twice the bits
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    vpermq(dst, dst, 0xD8, vector_len);  // undo per-128-bit-lane interleave
  }
}
1434 
1435 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1436   switch(typ) {
1437     case T_BYTE:
1438       pinsrb(dst, val, idx);
1439       break;
1440     case T_SHORT:
1441       pinsrw(dst, val, idx);
1442       break;
1443     case T_INT:
1444       pinsrd(dst, val, idx);
1445       break;
1446     case T_LONG:
1447       pinsrq(dst, val, idx);
1448       break;
1449     default:
1450       assert(false,"Should not reach here.");
1451       break;
1452   }
1453 }
1454 
1455 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1456   switch(typ) {
1457     case T_BYTE:
1458       vpinsrb(dst, src, val, idx);
1459       break;
1460     case T_SHORT:
1461       vpinsrw(dst, src, val, idx);
1462       break;
1463     case T_INT:
1464       vpinsrd(dst, src, val, idx);
1465       break;
1466     case T_LONG:
1467       vpinsrq(dst, src, val, idx);
1468       break;
1469     default:
1470       assert(false,"Should not reach here.");
1471       break;
1472   }
1473 }
1474 
// Masked scalar gather of one 64-bit slice (8 bytes or 4 shorts) into the
// low quadword of 'dst'. For each lane, the bit at 'mask[mask_idx]' is
// tested with btq (bit -> CF); a clear bit skips the load, leaving the lane
// zero from the initial vpxor. 'idx_base' holds 32-bit indices, 4 bytes
// apart. 'mask_idx' is advanced per lane whether or not the load happens.
void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
                                         Register base, Register idx_base,
                                         Register mask, Register mask_idx,
                                         Register rtmp, int vlen_enc) {
  vpxor(dst, dst, dst, vlen_enc);  // default all lanes to zero
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);                    // CF = mask bit
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
      bind(skip_load);
      incq(mask_idx);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);                    // CF = mask bit
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      pinsrb(dst, Address(base, rtmp), i);
      bind(skip_load);
      incq(mask_idx);
    }
  }
}
1505 
1506 void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
1507                                   Register base, Register idx_base,
1508                                   Register rtmp, int vlen_enc) {
1509   vpxor(dst, dst, dst, vlen_enc);
1510   if (elem_bt == T_SHORT) {
1511     for (int i = 0; i < 4; i++) {
1512       // dst[i] = src[idx_base[i]]
1513       movl(rtmp, Address(idx_base, i * 4));
1514       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1515     }
1516   } else {
1517     assert(elem_bt == T_BYTE, "");
1518     for (int i = 0; i < 8; i++) {
1519       // dst[i] = src[idx_base[i]]
1520       movl(rtmp, Address(idx_base, i * 4));
1521       pinsrb(dst, Address(base, rtmp), i);
1522     }
1523   }
1524 }
1525 
1526 /*
1527  * Gather using hybrid algorithm, first partially unroll scalar loop
1528  * to accumulate values from gather indices into a quad-word(64bit) slice.
1529  * A slice may hold 8 bytes or 4 short values. This is followed by a vector
1530  * permutation to place the slice into appropriate vector lane
1531  * locations in destination vector. Following pseudo code describes the
1532  * algorithm in detail:
1533  *
1534  * DST_VEC = ZERO_VEC
1535  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1536  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1537  * FOREACH_ITER:
1538  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1539  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1540  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1541  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1542  *
 * With each iteration, the doubleword permute indices (0,1) corresponding
 * to the gathered quadword get right-shifted by two lane positions.
 *
1546  */
// Subword (byte/short) gather driver; see the comment block above for the
// algorithm. Each loop iteration gathers one 64-bit slice with
// vgather8b[_masked], permutes it into its destination lanes, ORs it into
// the accumulating result, and advances 'idx_base'. 'length' counts down the
// remaining lanes; 'mask' may be noreg for the unmasked variant.
void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
                                        Register base, Register idx_base,
                                        Register mask, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister temp_dst,
                                        Register rtmp, Register mask_idx,
                                        Register length, int vector_len, int vlen_enc) {
  Label GATHER8_LOOP;
  assert(is_subword_type(elem_ty), "");
  movl(length, vector_len);
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
  vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
  // Build the constant {2, 2, ...} as (0 - (-1)) << 1
  vallones(xtmp2, vlen_enc);
  vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
  vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
  load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}

  bind(GATHER8_LOOP);
    // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
    if (mask == noreg) {
      vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
    } else {
      vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
    }
    // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
    vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
    // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
    vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
    // DST_VEC = DST_VEC OR TEMP_PERM_VEC
    vpor(dst, dst, temp_dst, vlen_enc);
    // Each iteration consumes 8 bytes of indices for bytes / 4 for shorts
    addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
    subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
    jcc(Assembler::notEqual, GATHER8_LOOP);
}
1580 
1581 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1582   switch(typ) {
1583     case T_INT:
1584       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1585       break;
1586     case T_FLOAT:
1587       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1588       break;
1589     case T_LONG:
1590       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1591       break;
1592     case T_DOUBLE:
1593       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1594       break;
1595     default:
1596       assert(false,"Should not reach here.");
1597       break;
1598   }
1599 }
1600 
1601 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1602   switch(typ) {
1603     case T_INT:
1604       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1605       break;
1606     case T_FLOAT:
1607       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1608       break;
1609     case T_LONG:
1610       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1611       break;
1612     case T_DOUBLE:
1613       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1614       break;
1615     default:
1616       assert(false,"Should not reach here.");
1617       break;
1618   }
1619 }
1620 
1621 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1622   switch(typ) {
1623     case T_INT:
1624       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1625       break;
1626     case T_FLOAT:
1627       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1628       break;
1629     case T_LONG:
1630       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1631       break;
1632     case T_DOUBLE:
1633       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1634       break;
1635     default:
1636       assert(false,"Should not reach here.");
1637       break;
1638   }
1639 }
1640 
// Turn a byte-per-lane boolean vector in 'src' into a full vector mask in
// 'dst': dst = 0 - src makes each non-zero byte 0xFF (and zero bytes stay 0),
// then the bytes are sign-extended to the lane width of 'elem_bt'.
// 'is_legacy' restricts the byte subtraction to 256-bit AVX encoding.
void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
  if (vlen_in_bytes <= 16) {
    pxor (dst, dst);
    psubb(dst, src);   // dst = -src: 0x01 -> 0xFF per byte
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  pmovsxbw(dst, dst); break;
      case T_INT:    pmovsxbd(dst, dst); break;
      case T_FLOAT:  pmovsxbd(dst, dst); break;
      case T_LONG:   pmovsxbq(dst, dst); break;
      case T_DOUBLE: pmovsxbq(dst, dst); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  } else {
    assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
    int vlen_enc = vector_length_encoding(vlen_in_bytes);

    vpxor (dst, dst, dst, vlen_enc);
    vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);

    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */            break;
      case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
      case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
      case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
      case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
      case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  }
}
1674 
// Turn a byte-per-lane boolean vector in 'src' into an opmask register 'dst'.
// 'novlbwdq' selects the fallback for targets without AVX512VL/BW support:
// widen to dwords and compare against the mask-bit pattern instead of using
// evpmovb2m. 'xtmp' is clobbered.
void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
  if (novlbwdq) {
    vpmovsxbd(xtmp, src, vlen_enc);
    evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
            Assembler::eq, true, vlen_enc, noreg);
  } else {
    vpxor(xtmp, xtmp, xtmp, vlen_enc);
    vpsubb(xtmp, xtmp, src, vlen_enc);   // 0 - src: non-zero bytes -> 0xFF
    evpmovb2m(dst, xtmp, vlen_enc);      // high bit of each byte -> mask bit
  }
}
1686 
1687 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1688   if (is_integral_type(bt)) {
1689     switch (vlen_in_bytes) {
1690       case 4:  movdl(dst, src);   break;
1691       case 8:  movq(dst, src);    break;
1692       case 16: movdqu(dst, src);  break;
1693       case 32: vmovdqu(dst, src); break;
1694       case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1695       default: ShouldNotReachHere();
1696     }
1697   } else {
1698     switch (vlen_in_bytes) {
1699       case 4:  movflt(dst, src); break;
1700       case 8:  movdbl(dst, src); break;
1701       case 16: movups(dst, src); break;
1702       case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1703       case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1704       default: ShouldNotReachHere();
1705     }
1706   }
1707 }
1708 
1709 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1710   assert(rscratch != noreg || always_reachable(src), "missing");
1711 
1712   if (reachable(src)) {
1713     load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1714   } else {
1715     lea(rscratch, src);
1716     load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1717   }
1718 }
1719 
// Splat a constant from the constant table at 'src' across 'dst', picking the
// cheapest broadcast available on the current CPU; falls back to a plain
// vector load when no broadcast form applies.
void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
  int vlen_enc = vector_length_encoding(vlen);
  if (VM_Version::supports_avx()) {
    if (bt == T_LONG) {
      if (VM_Version::supports_avx2()) {
        vpbroadcastq(dst, src, vlen_enc);
      } else {
        vmovddup(dst, src, vlen_enc);   // AVX1 has no integer qword broadcast
      }
    } else if (bt == T_DOUBLE) {
      if (vlen_enc != Assembler::AVX_128bit) {
        vbroadcastsd(dst, src, vlen_enc, noreg);
      } else {
        vmovddup(dst, src, vlen_enc);   // vbroadcastsd has no 128-bit memory form here
      }
    } else {
      if (VM_Version::supports_avx2() && is_integral_type(bt)) {
        vpbroadcastd(dst, src, vlen_enc);
      } else {
        vbroadcastss(dst, src, vlen_enc);
      }
    }
  } else if (VM_Version::supports_sse3()) {
    movddup(dst, src);
  } else {
    load_vector(bt, dst, src, vlen);
  }
}
1748 
// Load the first 'vlen_in_bytes' bytes of the iota ({0, 1, 2, ...}) index
// table for element type 'bt' into 'dst'.
void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
  // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
  int offset = exact_log2(type2aelembytes(bt)) << 6;
  if (is_floating_point_type(bt)) {
    offset += 128;   // skip past the I/L integer tables to the F/D section
  }
  ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
  load_vector(T_BYTE, dst, addr, vlen_in_bytes);
}
1758 
1759 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1760 
// Emit one element-wise combining step of a 128-bit reduction:
// dst = dst OP src, where OP is selected by the reduction opcode (and for
// integer min/max/add/mul also by the element type). Long min/max and
// long multiply require AVX-512 instructions.
void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
  int vector_len = Assembler::AVX_128bit;

  switch (opcode) {
    case Op_AndReductionV:  pand(dst, src); break;
    case Op_OrReductionV:   por (dst, src); break;
    case Op_XorReductionV:  pxor(dst, src); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:        pminsb(dst, src); break;
        case T_SHORT:       pminsw(dst, src); break;
        case T_INT:         pminsd(dst, src); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:        pmaxsb(dst, src); break;
        case T_SHORT:       pmaxsw(dst, src); break;
        case T_INT:         pmaxsd(dst, src); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMinReductionV:
      switch (typ) {
        case T_BYTE:        vpminub(dst, dst, src, Assembler::AVX_128bit); break;
        case T_SHORT:       vpminuw(dst, dst, src, Assembler::AVX_128bit); break;
        case T_INT:         vpminud(dst, dst, src, Assembler::AVX_128bit); break;
        case T_LONG:        evpminuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxub(dst, dst, src, Assembler::AVX_128bit); break;
        case T_SHORT:       vpmaxuw(dst, dst, src, Assembler::AVX_128bit); break;
        case T_INT:         vpmaxud(dst, dst, src, Assembler::AVX_128bit); break;
        case T_LONG:        evpmaxuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    // Scalar FP adds/muls: ordered reductions combine one lane at a time.
    case Op_AddReductionVF: addss(dst, src); break;
    case Op_AddReductionVD: addsd(dst, src); break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:        paddb(dst, src); break;
        case T_SHORT:       paddw(dst, src); break;
        case T_INT:         paddd(dst, src); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: paddq(dst, src); break;
    case Op_MulReductionVF: mulss(dst, src); break;
    case Op_MulReductionVD: mulsd(dst, src); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT:       pmullw(dst, src); break;
        case T_INT:         pmulld(dst, src); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: assert(UseAVX > 2, "required");
                            evpmullq(dst, dst, src, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}
1831 
1832 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1833   switch (opcode) {
1834     case Op_AddReductionVF: addps(dst, src); break;
1835     case Op_AddReductionVD: addpd(dst, src); break;
1836     case Op_MulReductionVF: mulps(dst, src); break;
1837     case Op_MulReductionVD: mulpd(dst, src); break;
1838     default:                assert(false, "%s", NodeClassNames[opcode]);
1839   }
1840 }
1841 
// Emit one element-wise combining step of a 256-bit reduction:
// dst = src1 OP src2, where OP is selected by the reduction opcode (and for
// integer min/max/add/mul also by the element type). Long min/max and
// long multiply require AVX-512 instructions.
void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
    case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
    case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
        case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpminsq(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpmaxsq(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMinReductionV:
      switch (typ) {
        case T_BYTE:        vpminub(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpminuw(dst, src1, src2, vector_len); break;
        case T_INT:         vpminud(dst, src1, src2, vector_len); break;
        case T_LONG:        evpminuq(dst, k0, src1, src2, true, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxub(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpmaxuw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmaxud(dst, src1, src2, vector_len); break;
        case T_LONG:        evpmaxuq(dst, k0, src1, src2, true, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
        case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}
1907 
1908 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1909   int vector_len = Assembler::AVX_256bit;
1910 
1911   switch (opcode) {
1912     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1913     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1914     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1915     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1916     default:                assert(false, "%s", NodeClassNames[opcode]);
1917   }
1918 }
1919 
1920 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1921                                   XMMRegister dst, XMMRegister src,
1922                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1923   switch (opcode) {
1924     case Op_AddReductionVF:
1925     case Op_MulReductionVF:
1926       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1927       break;
1928 
1929     case Op_AddReductionVD:
1930     case Op_MulReductionVD:
1931       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1932       break;
1933 
1934     default: assert(false, "wrong opcode");
1935   }
1936 }
1937 
1938 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1939                                             XMMRegister dst, XMMRegister src,
1940                                             XMMRegister vtmp1, XMMRegister vtmp2) {
1941   switch (opcode) {
1942     case Op_AddReductionVF:
1943     case Op_MulReductionVF:
1944       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1945       break;
1946 
1947     case Op_AddReductionVD:
1948     case Op_MulReductionVD:
1949       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1950       break;
1951 
1952     default: assert(false, "%s", NodeClassNames[opcode]);
1953   }
1954 }
1955 
1956 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1957                              Register dst, Register src1, XMMRegister src2,
1958                              XMMRegister vtmp1, XMMRegister vtmp2) {
1959   switch (vlen) {
1960     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1961     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1962     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1963     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1964 
1965     default: assert(false, "wrong vector length");
1966   }
1967 }
1968 
1969 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1970                              Register dst, Register src1, XMMRegister src2,
1971                              XMMRegister vtmp1, XMMRegister vtmp2) {
1972   switch (vlen) {
1973     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1974     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1975     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1976     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1977 
1978     default: assert(false, "wrong vector length");
1979   }
1980 }
1981 
1982 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1983                              Register dst, Register src1, XMMRegister src2,
1984                              XMMRegister vtmp1, XMMRegister vtmp2) {
1985   switch (vlen) {
1986     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1987     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1988     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1989     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1990 
1991     default: assert(false, "wrong vector length");
1992   }
1993 }
1994 
1995 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1996                              Register dst, Register src1, XMMRegister src2,
1997                              XMMRegister vtmp1, XMMRegister vtmp2) {
1998   switch (vlen) {
1999     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2000     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2001     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2002     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2003 
2004     default: assert(false, "wrong vector length");
2005   }
2006 }
2007 
2008 void C2_MacroAssembler::reduceL(int opcode, int vlen,
2009                              Register dst, Register src1, XMMRegister src2,
2010                              XMMRegister vtmp1, XMMRegister vtmp2) {
2011   switch (vlen) {
2012     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2013     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2014     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2015 
2016     default: assert(false, "wrong vector length");
2017   }
2018 }
2019 
2020 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2021   switch (vlen) {
2022     case 2:
2023       assert(vtmp2 == xnoreg, "");
2024       reduce2F(opcode, dst, src, vtmp1);
2025       break;
2026     case 4:
2027       assert(vtmp2 == xnoreg, "");
2028       reduce4F(opcode, dst, src, vtmp1);
2029       break;
2030     case 8:
2031       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2032       break;
2033     case 16:
2034       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2035       break;
2036     default: assert(false, "wrong vector length");
2037   }
2038 }
2039 
2040 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2041   switch (vlen) {
2042     case 2:
2043       assert(vtmp2 == xnoreg, "");
2044       reduce2D(opcode, dst, src, vtmp1);
2045       break;
2046     case 4:
2047       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2048       break;
2049     case 8:
2050       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2051       break;
2052     default: assert(false, "wrong vector length");
2053   }
2054 }
2055 
2056 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2057   switch (vlen) {
2058     case 2:
2059       assert(vtmp1 == xnoreg, "");
2060       assert(vtmp2 == xnoreg, "");
2061       unorderedReduce2F(opcode, dst, src);
2062       break;
2063     case 4:
2064       assert(vtmp2 == xnoreg, "");
2065       unorderedReduce4F(opcode, dst, src, vtmp1);
2066       break;
2067     case 8:
2068       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2069       break;
2070     case 16:
2071       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2072       break;
2073     default: assert(false, "wrong vector length");
2074   }
2075 }
2076 
2077 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2078   switch (vlen) {
2079     case 2:
2080       assert(vtmp1 == xnoreg, "");
2081       assert(vtmp2 == xnoreg, "");
2082       unorderedReduce2D(opcode, dst, src);
2083       break;
2084     case 4:
2085       assert(vtmp2 == xnoreg, "");
2086       unorderedReduce4D(opcode, dst, src, vtmp1);
2087       break;
2088     case 8:
2089       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2090       break;
2091     default: assert(false, "wrong vector length");
2092   }
2093 }
2094 
// Reduce two int lanes of src2 together with the scalar accumulator in src1;
// the scalar result lands in dst. Addition uses a horizontal add; other ops
// shuffle lane 1 down and combine explicitly.
void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, vtmp1);   // vtmp1[0] = src2[0] + src2[1]
  } else {
    pshufd(vtmp1, src2, 0x1);   // bring lane 1 down to lane 0
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
  }
  movdl(vtmp2, src1);   // scalar accumulator -> vector register
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  movdl(dst, vtmp1);
}
2109 
// Reduce four int lanes of src2 plus the scalar in src1 into dst: fold the
// upper pair into the lower pair, then finish with the two-lane reduction.
void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, src2);   // pairwise sums in the low two lanes
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    pshufd(vtmp2, src2, 0xE);   // move lanes 2,3 down to lanes 0,1
    reduce_operation_128(T_INT, opcode, vtmp2, src2);
    reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
  }
}
2123 
// Reduce eight int lanes of src2 plus the scalar in src1 into dst: fold the
// high 128-bit half into the low half, then reduce the remaining lanes.
void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);   // pairwise sums per 128-bit half
    vextracti128_high(vtmp2, vtmp1);
    vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    vextracti128_high(vtmp1, src2);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
    reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  }
}
2136 
// Reduce sixteen int lanes of src2 plus the scalar in src1 into dst: fold the
// high 256-bit half into the low half, then reduce the remaining eight lanes.
void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
  reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2142 
// Reduce eight byte lanes of src2 plus the scalar in src1 into dst by
// successively halving the live lane count (shuffle/shift then combine),
// widening bytes for the final combine with the scalar accumulator
// (zero-extend for unsigned min/max, sign-extend otherwise).
void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp2, src2, 0x1);   // fold bytes 4..7 onto bytes 0..3
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  movdqu(vtmp1, vtmp2);
  psrldq(vtmp1, 2);           // fold bytes 2..3 onto bytes 0..1
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  movdqu(vtmp2, vtmp1);
  psrldq(vtmp2, 1);           // fold byte 1 onto byte 0
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  movdl(vtmp2, src1);         // scalar accumulator -> vector register
  if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
    pmovzxbd(vtmp1, vtmp1);
  } else {
    pmovsxbd(vtmp1, vtmp1);
  }
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrb(dst, vtmp1, 0x0);
  movsbl(dst, dst);           // sign-extend the byte result to 32 bits
}
2162 
// Reduce sixteen byte lanes: fold the upper 8 bytes onto the lower 8, then
// finish with the eight-lane reduction.
void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp1, src2, 0xE);   // move the high quadword down
  reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
  reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2168 
// Reduce thirty-two byte lanes: fold the high 128-bit half onto the low
// half, then finish with the sixteen-lane reduction.
void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp2, src2);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2174 
// Reduce sixty-four byte lanes: fold the high 256-bit half onto the low
// half, then finish with the thirty-two-lane reduction.
void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp1, src2);
  reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
  reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2180 
// Multiply-reduce eight byte lanes: there is no byte multiply on x86, so
// sign-extend the bytes to shorts and reuse the short reduction.
void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pmovsxbw(vtmp2, src2);
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2185 
// Multiply-reduce sixteen byte lanes by widening to shorts. With AVX2 all
// sixteen bytes widen into one 256-bit vector; otherwise the two 8-byte
// halves are widened and reduced separately, threading dst through as the
// accumulator for the second half.
void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 1) {
    int vector_len = Assembler::AVX_256bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    pmovsxbw(vtmp2, src2);
    reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
    pshufd(vtmp2, src2, 0xe);    // high 8 bytes down to the low quadword
    pmovsxbw(vtmp2, vtmp2);
    reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
  }
}
2199 
// Multiply-reduce thirty-two byte lanes by widening to shorts. With
// AVX512BW all bytes widen into one 512-bit vector; otherwise the two
// 128-bit halves are handled as sixteen-byte reductions, threading dst
// through as the accumulator for the second half.
void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
    int vector_len = Assembler::AVX_512bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce32S(opcode, dst, src1, vtmp1, vtmp2, vtmp1);
  } else {
    assert(UseAVX >= 2,"Should not reach here.");
    mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
    vextracti128_high(vtmp2, src2);
    mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
  }
}
2212 
// Multiply-reduce sixty-four byte lanes as two thirty-two-byte reductions,
// threading dst through as the accumulator for the second half.
void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
  vextracti64x4_high(vtmp2, src2);
  mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
}
2218 
// Reduce four short lanes of src2 plus the scalar in src1 into dst.
// Addition uses two horizontal adds; other ops halve the live lanes by
// shuffle/shift. The final combine with the scalar widens shorts to ints
// (zero-extend for unsigned min/max, sign-extend otherwise).
void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddw(vtmp1, vtmp1);   // 4 lanes -> 2 pairwise sums
    phaddw(vtmp1, vtmp1);   // 2 lanes -> 1 sum
  } else {
    pshufd(vtmp2, src2, 0x1);   // fold lanes 2,3 onto lanes 0,1
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
    movdqu(vtmp1, vtmp2);
    psrldq(vtmp1, 2);           // fold lane 1 onto lane 0
    reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
  }
  movdl(vtmp2, src1);   // scalar accumulator -> vector register
  if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
    pmovzxwd(vtmp1, vtmp1);
  } else {
    pmovsxwd(vtmp1, vtmp1);
  }
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrw(dst, vtmp1, 0x0);
  movswl(dst, dst);     // sign-extend the short result to 32 bits
}
2243 
// Reduce eight short lanes of src2 plus the scalar in src1 into dst: fold
// the upper four lanes onto the lower four, then finish with the four-lane
// reduction.
void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddw(vtmp1, src2);   // pairwise sums into the low four lanes
  } else {
    assert_different_registers(src2, vtmp1);
    pshufd(vtmp1, src2, 0xE);   // high quadword down to the low quadword
    reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
  }
  reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2257 
// Reduce sixteen short lanes of src2 plus the scalar in src1 into dst: fold
// the two 128-bit halves together, then finish with the eight-lane reduction.
void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    int vector_len = Assembler::AVX_256bit;
    vphaddw(vtmp2, src2, src2, vector_len);   // pairwise sums per 128-bit half
    vpermq(vtmp2, vtmp2, 0xD8, vector_len);   // interleave the halves' results
  } else {
    assert_different_registers(src2, vtmp2);
    vextracti128_high(vtmp2, src2);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
  }
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2270 
2271 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2272   assert_different_registers(src2, vtmp1);
2273   int vector_len = Assembler::AVX_256bit;
2274   vextracti64x4_high(vtmp1, src2);
2275   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2276   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2277 }
2278 
// Reduce two long lanes of src2 plus the scalar accumulator in src1 into dst.
void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp2, src2, 0xE);   // bring lane 1 down to lane 0
  reduce_operation_128(T_LONG, opcode, vtmp2, src2);
  movdq(vtmp1, src1);         // scalar accumulator -> vector register
  reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
  movdq(dst, vtmp1);
}
2286 
// Reduce four long lanes: fold the high 128-bit half onto the low half,
// then finish with the two-lane reduction.
void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp1, src2);
  reduce_operation_128(T_LONG, opcode, vtmp1, src2);
  reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2292 
// Reduces eight 64-bit lanes of the 512-bit vector src2 (plus accumulator src1)
// into dst: fold the two 256-bit halves, then reduce4L.
void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);                          // upper 256 bits
  reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); // combine halves element-wise
  reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2298 
// Materializes an opmask register with the low 'len' bits set.
void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
  mov64(temp, -1L);        // all 64 bits set
  bzhiq(temp, temp, len);  // zero the bits at positions >= len
  kmovql(dst, temp);       // move the mask into the opmask register
}
2304 
// Ordered reduction over two float elements: dst already holds the running
// accumulator; fold in src element 0, then element 1 — one element at a time
// to preserve left-to-right evaluation order (cf. unorderedReduce2F).
void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_FLOAT, opcode, dst, src);   // fold element 0
  pshufd(vtmp, src, 0x1);                            // element 1 -> lane 0
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);  // fold element 1
}
2310 
// Ordered reduction over four float elements, folded into dst one at a time
// to preserve left-to-right evaluation order.
void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce2F(opcode, dst, src, vtmp);                  // elements 0 and 1
  pshufd(vtmp, src, 0x2);                            // element 2 -> lane 0
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
  pshufd(vtmp, src, 0x3);                            // element 3 -> lane 0
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
2318 
// Ordered reduction over eight floats: the low 128 bits first, then the
// extracted high 128 bits.
void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4F(opcode, dst, src, vtmp2);  // elements 0..3
  vextractf128_high(vtmp2, src);      // elements 4..7
  reduce4F(opcode, dst, vtmp2, vtmp1);
}
2324 
// Ordered reduction over sixteen floats: the low 256 bits first (via reduce8F),
// then the extracted high 256 bits.
void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce8F(opcode, dst, src, vtmp1, vtmp2);  // elements 0..7
  vextracti64x4_high(vtmp1, src);            // elements 8..15
  reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
}
2330 
// Unordered (tree-shaped) reduction of two floats: dst = src[0] op src[1].
void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0x1);                            // element 1 -> lane 0 of dst
  reduce_operation_128(T_FLOAT, opcode, dst, src);  // combine with element 0
}
2335 
// Unordered reduction of four floats: fold the upper 64 bits onto the lower
// 64 bits pairwise, then finish with the two-element reduction.
void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  pshufd(vtmp, src, 0xE);                                      // elements 2,3 -> lanes 0,1
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);  // combine halves
  unorderedReduce2F(opcode, dst, vtmp);
}
2341 
// Unordered reduction of eight floats: fold the two 128-bit halves, then
// finish with the four-element reduction.
void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf128_high(vtmp1, src);                                // elements 4..7
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);  // combine halves
  unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
}
2347 
// Unordered reduction of sixteen floats: fold the two 256-bit halves, then
// finish with the eight-element reduction.
void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);                                      // elements 8..15
  unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);  // combine halves
  unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
}
2353 
// Ordered reduction over two double elements: dst already holds the running
// accumulator; fold in src element 0, then element 1.
void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_DOUBLE, opcode, dst, src);   // fold element 0
  pshufd(vtmp, src, 0xE);                             // element 1 -> lane 0
  reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);  // fold element 1
}
2359 
// Ordered reduction over four doubles: the low 128 bits first, then the
// extracted high 128 bits.
void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce2D(opcode, dst, src, vtmp2);  // elements 0 and 1
  vextractf128_high(vtmp2, src);      // elements 2 and 3
  reduce2D(opcode, dst, vtmp2, vtmp1);
}
2365 
// Ordered reduction over eight doubles: the low 256 bits first (via reduce4D),
// then the extracted high 256 bits.
void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4D(opcode, dst, src, vtmp1, vtmp2);  // elements 0..3
  vextracti64x4_high(vtmp1, src);            // elements 4..7
  reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
}
2371 
// Unordered reduction of two doubles: dst = src[0] op src[1].
void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0xE);                             // element 1 -> lane 0 of dst
  reduce_operation_128(T_DOUBLE, opcode, dst, src);  // combine with element 0
}
2376 
// Unordered reduction of four doubles: fold the two 128-bit halves, then
// finish with the two-element reduction.
void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  vextractf128_high(vtmp, src);                                 // elements 2 and 3
  unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);  // combine halves
  unorderedReduce2D(opcode, dst, vtmp);
}
2382 
// Unordered reduction of eight doubles: fold the two 256-bit halves, then
// finish with the four-element reduction.
void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);                                       // elements 4..7
  unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);  // combine halves
  unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
}
2388 
// Masked vector load (memory -> register); forwards to the shared MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2392 
// Masked vector store (register -> memory); forwards to the shared MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2396 
// Masked register-to-register vector move; forwards to the shared MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2400 
2401 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2402                                  int vec_enc) {
2403   switch(elem_bt) {
2404     case T_INT:
2405     case T_FLOAT:
2406       vmaskmovps(dst, src, mask, vec_enc);
2407       break;
2408     case T_LONG:
2409     case T_DOUBLE:
2410       vmaskmovpd(dst, src, mask, vec_enc);
2411       break;
2412     default:
2413       fatal("Unsupported type %s", type2name(elem_bt));
2414       break;
2415   }
2416 }
2417 
2418 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2419                                  int vec_enc) {
2420   switch(elem_bt) {
2421     case T_INT:
2422     case T_FLOAT:
2423       vmaskmovps(dst, src, mask, vec_enc);
2424       break;
2425     case T_LONG:
2426     case T_DOUBLE:
2427       vmaskmovpd(dst, src, mask, vec_enc);
2428       break;
2429     default:
2430       fatal("Unsupported type %s", type2name(elem_bt));
2431       break;
2432   }
2433 }
2434 
// Min/max reduction over 'vlen' floats held in src. Each loop iteration halves
// the live element count: step i==3 folds in the upper 256 bits, i==2 the upper
// 128 bits, i==1/i==0 use in-lane permutes (permconst). When is_dst_valid, dst
// carries an incoming accumulator that is folded in after the loop. xmm_0/xmm_1
// are scratch (xmm_1 may be xnoreg, in which case xmm_0 doubles as the shuffle
// temp); tmp/atmp/btmp are only used by the pre-AVX10.2 NaN-aware helper.
void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
                                          XMMRegister dst, XMMRegister src,
                                          XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                          XMMRegister xmm_0, XMMRegister xmm_1) {
  const int permconst[] = {1, 14};  // vpermilps imm8 for steps i==0 and i==1
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;

  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 16) {
    vlen_enc = Assembler::AVX_256bit;
  }

  for (int i = log2(vlen) - 1; i >=0; i--) {
    if (i == 0 && !is_dst_valid) {
      // Last step with no incoming accumulator: write straight into dst.
      wdst = dst;
    }
    if (i == 3) {
      vextracti64x4_high(wtmp, wsrc);   // fold upper 256 bits (16 -> 8 elements)
    } else if (i == 2) {
      vextracti128_high(wtmp, wsrc);    // fold upper 128 bits (8 -> 4 elements)
    } else { // i = [0,1]
      vpermilps(wtmp, wsrc, permconst[i], vlen_enc);  // in-lane shuffle (4 -> 2 -> 1)
    }

    if (VM_Version::supports_avx10_2()) {
      vminmax_fp_avx10_2(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }
    wsrc = wdst;                          // result becomes next step's input
    vlen_enc = Assembler::AVX_128bit;     // after the first step, 128 bits suffice
  }
  if (is_dst_valid) {
    // Fold the reduced value into the incoming accumulator in dst.
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp_avx10_2(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2477 
// Min/max reduction over 'vlen' doubles held in src. Same halving strategy as
// reduceFloatMinMax: step i==2 folds the upper 256 bits, i==1 the upper 128
// bits, i==0 the remaining pair via an in-lane permute. When is_dst_valid, dst
// carries an incoming accumulator folded in after the loop. tmp/atmp/btmp are
// only used by the pre-AVX10.2 NaN-aware helper.
void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
                                        XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                        XMMRegister xmm_0, XMMRegister xmm_1) {
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;  // xmm_0 doubles as the shuffle temp if needed
  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 8) {
    vlen_enc = Assembler::AVX_256bit;
  }
  for (int i = log2(vlen) - 1; i >=0; i--) {
    if (i == 0 && !is_dst_valid) {
      // Last step with no incoming accumulator: write straight into dst.
      wdst = dst;
    }
    if (i == 1) {
      vextracti128_high(wtmp, wsrc);   // fold upper 128 bits (4 -> 2 elements)
    } else if (i == 2) {
      vextracti64x4_high(wtmp, wsrc);  // fold upper 256 bits (8 -> 4 elements)
    } else {
      assert(i == 0, "%d", i);
      vpermilpd(wtmp, wsrc, 1, vlen_enc);  // swap the remaining pair (2 -> 1)
    }

    if (VM_Version::supports_avx10_2()) {
      vminmax_fp_avx10_2(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }

    wsrc = wdst;                       // result becomes next step's input
    vlen_enc = Assembler::AVX_128bit;  // after the first step, 128 bits suffice
  }

  if (is_dst_valid) {
    // Fold the reduced value into the incoming accumulator in dst.
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp_avx10_2(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2519 
2520 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2521   switch (bt) {
2522     case T_BYTE:  pextrb(dst, src, idx); break;
2523     case T_SHORT: pextrw(dst, src, idx); break;
2524     case T_INT:   pextrd(dst, src, idx); break;
2525     case T_LONG:  pextrq(dst, src, idx); break;
2526 
2527     default:
2528       assert(false,"Should not reach here.");
2529       break;
2530   }
2531 }
2532 
2533 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2534   int esize =  type2aelembytes(typ);
2535   int elem_per_lane = 16/esize;
2536   int lane = elemindex / elem_per_lane;
2537   int eindex = elemindex % elem_per_lane;
2538 
2539   if (lane >= 2) {
2540     assert(UseAVX > 2, "required");
2541     vextractf32x4(dst, src, lane & 3);
2542     return dst;
2543   } else if (lane > 0) {
2544     assert(UseAVX > 0, "required");
2545     vextractf128(dst, src, lane);
2546     return dst;
2547   } else {
2548     return src;
2549   }
2550 }
2551 
2552 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2553   if (typ == T_BYTE) {
2554     movsbl(dst, dst);
2555   } else if (typ == T_SHORT) {
2556     movswl(dst, dst);
2557   }
2558 }
2559 
// Extracts the integral element at in-lane index (elemindex % elements-per-lane)
// of src into GPR dst, sign-extending sub-int types to 32 bits. src is expected
// to already hold the correct 128-bit lane (cf. get_lane).
void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
  int esize =  type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;
  assert(is_integral_type(typ),"required");

  if (eindex == 0) {
    // Element 0: a plain move is cheaper than pextr*.
    if (typ == T_LONG) {
      movq(dst, src);
    } else {
      movdl(dst, src);
      movsxl(typ, dst);  // sign-extend byte/short to 32 bits
    }
  } else {
    extract(typ, dst, src, eindex);
    movsxl(typ, dst);    // sign-extend byte/short to 32 bits
  }
}
2578 
// Extracts the float/double element at in-lane index (elemindex % elements-per-
// lane) of src into lane 0 of dst, with the bits above the element zeroed.
// src is expected to already hold the correct 128-bit lane (cf. get_lane).
// vtmp is only required on the SSE (UseAVX == 0) float path.
void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
  int esize =  type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;
  assert((typ == T_FLOAT || typ == T_DOUBLE),"required");

  if (eindex == 0) {
    // Element 0: movq also zeroes the upper bits of dst.
    movq(dst, src);
  } else {
    if (typ == T_FLOAT) {
      if (UseAVX == 0) {
        movdqu(dst, src);
        shufps(dst, dst, eindex);                           // broadcast element into lane 0
      } else {
        vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
      }
    } else {
      if (UseAVX == 0) {
        movdqu(dst, src);
        psrldq(dst, eindex*esize);                          // shift element down to offset 0
      } else {
        vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
      }
      movq(dst, dst);                                       // clear bits above the double
    }
  }
  // Zero upper bits
  if (typ == T_FLOAT) {
    // The shuffles above replicate the element; mask everything above bit 31.
    if (UseAVX == 0) {
      assert(vtmp != xnoreg, "required.");
      movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
      pand(dst, vtmp);
    } else {
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
    }
  }
}
2616 
2617 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2618   switch(typ) {
2619     case T_BYTE:
2620     case T_BOOLEAN:
2621       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2622       break;
2623     case T_SHORT:
2624     case T_CHAR:
2625       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2626       break;
2627     case T_INT:
2628     case T_FLOAT:
2629       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2630       break;
2631     case T_LONG:
2632     case T_DOUBLE:
2633       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2634       break;
2635     default:
2636       assert(false,"Should not reach here.");
2637       break;
2638   }
2639 }
2640 
2641 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2642   assert(rscratch != noreg || always_reachable(src2), "missing");
2643 
2644   switch(typ) {
2645     case T_BOOLEAN:
2646     case T_BYTE:
2647       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2648       break;
2649     case T_CHAR:
2650     case T_SHORT:
2651       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2652       break;
2653     case T_INT:
2654     case T_FLOAT:
2655       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2656       break;
2657     case T_LONG:
2658     case T_DOUBLE:
2659       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2660       break;
2661     default:
2662       assert(false,"Should not reach here.");
2663       break;
2664   }
2665 }
2666 
2667 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2668   switch(typ) {
2669     case T_BYTE:
2670       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2671       break;
2672     case T_SHORT:
2673       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2674       break;
2675     case T_INT:
2676     case T_FLOAT:
2677       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2678       break;
2679     case T_LONG:
2680     case T_DOUBLE:
2681       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2682       break;
2683     default:
2684       assert(false,"Should not reach here.");
2685       break;
2686   }
2687 }
2688 
// Emits a vector test of src1 against src2, setting CPU flags (no result
// register). vtmp is required only for sub-16-byte vectors, where the low part
// of src1 is duplicated so the whole 128-bit register participates; src2 is
// expected to already cover the full register width.
void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
  assert(vlen_in_bytes <= 32, "");
  int esize = type2aelembytes(bt);
  if (vlen_in_bytes == 32) {
    assert(vtmp == xnoreg, "required.");
    if (esize >= 4) {
      vtestps(src1, src2, AVX_256bit);  // 32/64-bit elements: float-form test
    } else {
      vptest(src1, src2, AVX_256bit);   // 8/16-bit elements: integer test
    }
    return;
  }
  if (vlen_in_bytes < 16) {
    // Duplicate the lower part to fill the whole register,
    // Don't need to do so for src2
    assert(vtmp != xnoreg, "required");
    int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;  // replicate dword 0 or qword 0
    pshufd(vtmp, src1, shuffle_imm);
  } else {
    assert(vtmp == xnoreg, "required");
    vtmp = src1;  // full 128-bit vector: test src1 directly
  }
  if (esize >= 4 && VM_Version::supports_avx()) {
    vtestps(vtmp, src2, AVX_128bit);
  } else {
    ptest(vtmp, src2);
  }
}
2717 
2718 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2719 #ifdef ASSERT
2720   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2721   bool is_bw_supported = VM_Version::supports_avx512bw();
2722   if (is_bw && !is_bw_supported) {
2723     assert(vlen_enc != Assembler::AVX_512bit, "required");
2724     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2725            "XMM register should be 0-15");
2726   }
2727 #endif // ASSERT
2728   switch (elem_bt) {
2729     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2730     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2731     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2732     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2733     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2734     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2735     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2736   }
2737 }
2738 
// Broadcasts the GPR value src into every element of dst. Uses the single
// EVEX GPR-to-vector broadcast when the CPU supports the needed AVX-512
// features for this width/element size; otherwise falls back to a move into
// XMM followed by a register-source broadcast (limited to XMM0-15).
void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
  assert(UseAVX >= 2, "required");
  bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));   // needs AVX512BW
  bool is_vl = vlen_enc != Assembler::AVX_512bit;               // sub-512-bit needs AVX512VL
  if ((UseAVX > 2) &&
      (!is_bw || VM_Version::supports_avx512bw()) &&
      (!is_vl || VM_Version::supports_avx512vl())) {
    // EVEX path: broadcast directly from the GPR.
    switch (elem_bt) {
      case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
      case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
      case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
      case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  } else {
    // AVX2 fallback: GPR -> XMM move, then broadcast from the register.
    assert(vlen_enc != Assembler::AVX_512bit, "required");
    assert((dst->encoding() < 16),"XMM register should be 0-15");
    switch (elem_bt) {
      case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
      case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
      case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
      case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
      case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
      case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  }
}
2767 
2768 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2769   switch (to_elem_bt) {
2770     case T_SHORT:
2771       vpmovsxbw(dst, src, vlen_enc);
2772       break;
2773     case T_INT:
2774       vpmovsxbd(dst, src, vlen_enc);
2775       break;
2776     case T_FLOAT:
2777       vpmovsxbd(dst, src, vlen_enc);
2778       vcvtdq2ps(dst, dst, vlen_enc);
2779       break;
2780     case T_LONG:
2781       vpmovsxbq(dst, src, vlen_enc);
2782       break;
2783     case T_DOUBLE: {
2784       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2785       vpmovsxbd(dst, src, mid_vlen_enc);
2786       vcvtdq2pd(dst, dst, vlen_enc);
2787       break;
2788     }
2789     default:
2790       fatal("Unsupported type %s", type2name(to_elem_bt));
2791       break;
2792   }
2793 }
2794 
2795 //-------------------------------------------------------------------------------------------
2796 
2797 // IndexOf for constant substrings with size >= 8 chars
2798 // which don't need to be loaded through stack.
// Finds the first occurrence of the constant substring str2 (length int_cnt2,
// known at compile time, >= one pcmpestri stride) within str1 and leaves the
// element index in 'result' (-1 when not found). Register bindings are fixed
// by pcmpestri (see the table below); 'vec' holds the (re)loaded substring.
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2,  Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");

  // Load substring.
  if (ae == StrIntrinsicNode::UL) {
    pmovzxbw(vec, Address(str2, 0));  // latin1 substring vs UTF-16 string: widen bytes to chars
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring reminding elements and
    // cnt1 is number of string reminding elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less then substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less then substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only string if does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    negptr(cnt2);
    addptr(cnt2, stride);

    bind(SCAN_SUBSTR);
    subl(cnt1, stride);
    cmpl(cnt2, -stride); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, stride);
    movl(cnt2, stride); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      // Substring end offset fits in an immediate displacement.
      int tail_off1 = int_cnt2<<scale1;
      int tail_off2 = int_cnt2<<scale2;
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
      } else {
        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
      }
      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
      } else {
        movdqu(vec, Address(str2, tmp, scale2, 0));
      }
      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
    }
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, stride);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(EXIT);

} // string_indexofC8
2975 
2976 // Small strings are loaded through stack if they cross page boundary.
2977 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2978                                        Register cnt1, Register cnt2,
2979                                        int int_cnt2,  Register result,
2980                                        XMMRegister vec, Register tmp,
2981                                        int ae) {
2982   ShortBranchVerifier sbv(this);
2983   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2984   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2985 
2986   //
2987   // int_cnt2 is length of small (< 8 chars) constant substring
2988   // or (-1) for non constant substring in which case its length
2989   // is in cnt2 register.
2990   //
2991   // Note, inline_string_indexOf() generates checks:
2992   // if (substr.count > string.count) return -1;
2993   // if (substr.count == 0) return 0;
2994   //
2995   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2996   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2997   // This method uses the pcmpestri instruction with bound registers
2998   //   inputs:
2999   //     xmm - substring
3000   //     rax - substring length (elements count)
3001   //     mem - scanned string
3002   //     rdx - string length (elements count)
3003   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
3004   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
3005   //   outputs:
3006   //     rcx - matched index in string
3007   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3008   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
3009   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
3010   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
3011 
3012   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
3013         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
3014         FOUND_CANDIDATE;
3015 
3016   { //========================================================
3017     // We don't know where these strings are located
3018     // and we can't read beyond them. Load them through stack.
3019     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
3020 
3021     movptr(tmp, rsp); // save old SP
3022 
3023     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
3024       if (int_cnt2 == (1>>scale2)) { // One byte
3025         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
3026         load_unsigned_byte(result, Address(str2, 0));
3027         movdl(vec, result); // move 32 bits
3028       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
3029         // Not enough header space in 32-bit VM: 12+3 = 15.
3030         movl(result, Address(str2, -1));
3031         shrl(result, 8);
3032         movdl(vec, result); // move 32 bits
3033       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
3034         load_unsigned_short(result, Address(str2, 0));
3035         movdl(vec, result); // move 32 bits
3036       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
3037         movdl(vec, Address(str2, 0)); // move 32 bits
3038       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
3039         movq(vec, Address(str2, 0));  // move 64 bits
3040       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
3041         // Array header size is 12 bytes in 32-bit VM
3042         // + 6 bytes for 3 chars == 18 bytes,
3043         // enough space to load vec and shift.
3044         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3045         if (ae == StrIntrinsicNode::UL) {
3046           int tail_off = int_cnt2-8;
3047           pmovzxbw(vec, Address(str2, tail_off));
3048           psrldq(vec, -2*tail_off);
3049         }
3050         else {
3051           int tail_off = int_cnt2*(1<<scale2);
3052           movdqu(vec, Address(str2, tail_off-16));
3053           psrldq(vec, 16-tail_off);
3054         }
3055       }
3056     } else { // not constant substring
3057       cmpl(cnt2, stride);
3058       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3059 
3060       // We can read beyond string if srt+16 does not cross page boundary
3061       // since heaps are aligned and mapped by pages.
3062       assert(os::vm_page_size() < (int)G, "default page should be small");
3063       movl(result, str2); // We need only low 32 bits
3064       andl(result, ((int)os::vm_page_size()-1));
3065       cmpl(result, ((int)os::vm_page_size()-16));
3066       jccb(Assembler::belowEqual, CHECK_STR);
3067 
3068       // Move small strings to stack to allow load 16 bytes into vec.
3069       subptr(rsp, 16);
3070       int stk_offset = wordSize-(1<<scale2);
3071       push(cnt2);
3072 
3073       bind(COPY_SUBSTR);
3074       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3075         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3076         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3077       } else if (ae == StrIntrinsicNode::UU) {
3078         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3079         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3080       }
3081       decrement(cnt2);
3082       jccb(Assembler::notZero, COPY_SUBSTR);
3083 
3084       pop(cnt2);
3085       movptr(str2, rsp);  // New substring address
3086     } // non constant
3087 
3088     bind(CHECK_STR);
3089     cmpl(cnt1, stride);
3090     jccb(Assembler::aboveEqual, BIG_STRINGS);
3091 
3092     // Check cross page boundary.
3093     movl(result, str1); // We need only low 32 bits
3094     andl(result, ((int)os::vm_page_size()-1));
3095     cmpl(result, ((int)os::vm_page_size()-16));
3096     jccb(Assembler::belowEqual, BIG_STRINGS);
3097 
3098     subptr(rsp, 16);
3099     int stk_offset = -(1<<scale1);
3100     if (int_cnt2 < 0) { // not constant
3101       push(cnt2);
3102       stk_offset += wordSize;
3103     }
3104     movl(cnt2, cnt1);
3105 
3106     bind(COPY_STR);
3107     if (ae == StrIntrinsicNode::LL) {
3108       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3109       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3110     } else {
3111       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3112       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3113     }
3114     decrement(cnt2);
3115     jccb(Assembler::notZero, COPY_STR);
3116 
3117     if (int_cnt2 < 0) { // not constant
3118       pop(cnt2);
3119     }
3120     movptr(str1, rsp);  // New string address
3121 
3122     bind(BIG_STRINGS);
3123     // Load substring.
3124     if (int_cnt2 < 0) { // -1
3125       if (ae == StrIntrinsicNode::UL) {
3126         pmovzxbw(vec, Address(str2, 0));
3127       } else {
3128         movdqu(vec, Address(str2, 0));
3129       }
3130       push(cnt2);       // substr count
3131       push(str2);       // substr addr
3132       push(str1);       // string addr
3133     } else {
3134       // Small (< 8 chars) constant substrings are loaded already.
3135       movl(cnt2, int_cnt2);
3136     }
3137     push(tmp);  // original SP
3138 
3139   } // Finished loading
3140 
3141   //========================================================
3142   // Start search
3143   //
3144 
3145   movptr(result, str1); // string addr
3146 
3147   if (int_cnt2  < 0) {  // Only for non constant substring
3148     jmpb(SCAN_TO_SUBSTR);
3149 
3150     // SP saved at sp+0
3151     // String saved at sp+1*wordSize
3152     // Substr saved at sp+2*wordSize
3153     // Substr count saved at sp+3*wordSize
3154 
3155     // Reload substr for rescan, this code
3156     // is executed only for large substrings (> 8 chars)
3157     bind(RELOAD_SUBSTR);
3158     movptr(str2, Address(rsp, 2*wordSize));
3159     movl(cnt2, Address(rsp, 3*wordSize));
3160     if (ae == StrIntrinsicNode::UL) {
3161       pmovzxbw(vec, Address(str2, 0));
3162     } else {
3163       movdqu(vec, Address(str2, 0));
3164     }
3165     // We came here after the beginning of the substring was
3166     // matched but the rest of it was not so we need to search
3167     // again. Start from the next element after the previous match.
3168     subptr(str1, result); // Restore counter
3169     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3170       shrl(str1, 1);
3171     }
3172     addl(cnt1, str1);
3173     decrementl(cnt1);   // Shift to next element
3174     cmpl(cnt1, cnt2);
3175     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
3176 
3177     addptr(result, (1<<scale1));
3178   } // non constant
3179 
3180   // Scan string for start of substr in 16-byte vectors
3181   bind(SCAN_TO_SUBSTR);
3182   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3183   pcmpestri(vec, Address(result, 0), mode);
3184   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3185   subl(cnt1, stride);
3186   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3187   cmpl(cnt1, cnt2);
3188   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
3189   addptr(result, 16);
3190 
3191   bind(ADJUST_STR);
3192   cmpl(cnt1, stride); // Do not read beyond string
3193   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3194   // Back-up string to avoid reading beyond string.
3195   lea(result, Address(result, cnt1, scale1, -16));
3196   movl(cnt1, stride);
3197   jmpb(SCAN_TO_SUBSTR);
3198 
3199   // Found a potential substr
3200   bind(FOUND_CANDIDATE);
3201   // After pcmpestri tmp(rcx) contains matched element index
3202 
3203   // Make sure string is still long enough
3204   subl(cnt1, tmp);
3205   cmpl(cnt1, cnt2);
3206   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3207   // Left less then substring.
3208 
3209   bind(RET_NOT_FOUND);
3210   movl(result, -1);
3211   jmp(CLEANUP);
3212 
3213   bind(FOUND_SUBSTR);
3214   // Compute start addr of substr
3215   lea(result, Address(result, tmp, scale1));
3216   if (int_cnt2 > 0) { // Constant substring
3217     // Repeat search for small substring (< 8 chars)
3218     // from new point without reloading substring.
3219     // Have to check that we don't read beyond string.
3220     cmpl(tmp, stride-int_cnt2);
3221     jccb(Assembler::greater, ADJUST_STR);
3222     // Fall through if matched whole substring.
3223   } else { // non constant
3224     assert(int_cnt2 == -1, "should be != 0");
3225 
3226     addl(tmp, cnt2);
3227     // Found result if we matched whole substring.
3228     cmpl(tmp, stride);
3229     jcc(Assembler::lessEqual, RET_FOUND);
3230 
3231     // Repeat search for small substring (<= 8 chars)
3232     // from new point 'str1' without reloading substring.
3233     cmpl(cnt2, stride);
3234     // Have to check that we don't read beyond string.
3235     jccb(Assembler::lessEqual, ADJUST_STR);
3236 
3237     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3238     // Compare the rest of substring (> 8 chars).
3239     movptr(str1, result);
3240 
3241     cmpl(tmp, cnt2);
3242     // First 8 chars are already matched.
3243     jccb(Assembler::equal, CHECK_NEXT);
3244 
3245     bind(SCAN_SUBSTR);
3246     pcmpestri(vec, Address(str1, 0), mode);
3247     // Need to reload strings pointers if not matched whole vector
3248     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3249 
3250     bind(CHECK_NEXT);
3251     subl(cnt2, stride);
3252     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3253     addptr(str1, 16);
3254     if (ae == StrIntrinsicNode::UL) {
3255       addptr(str2, 8);
3256     } else {
3257       addptr(str2, 16);
3258     }
3259     subl(cnt1, stride);
3260     cmpl(cnt2, stride); // Do not read beyond substring
3261     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3262     // Back-up strings to avoid reading beyond substring.
3263 
3264     if (ae == StrIntrinsicNode::UL) {
3265       lea(str2, Address(str2, cnt2, scale2, -8));
3266       lea(str1, Address(str1, cnt2, scale1, -16));
3267     } else {
3268       lea(str2, Address(str2, cnt2, scale2, -16));
3269       lea(str1, Address(str1, cnt2, scale1, -16));
3270     }
3271     subl(cnt1, cnt2);
3272     movl(cnt2, stride);
3273     addl(cnt1, stride);
3274     bind(CONT_SCAN_SUBSTR);
3275     if (ae == StrIntrinsicNode::UL) {
3276       pmovzxbw(vec, Address(str2, 0));
3277     } else {
3278       movdqu(vec, Address(str2, 0));
3279     }
3280     jmp(SCAN_SUBSTR);
3281 
3282     bind(RET_FOUND_LONG);
3283     movptr(str1, Address(rsp, wordSize));
3284   } // non constant
3285 
3286   bind(RET_FOUND);
3287   // Compute substr offset
3288   subptr(result, str1);
3289   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3290     shrl(result, 1); // index
3291   }
3292   bind(CLEANUP);
3293   pop(rsp); // restore SP
3294 
3295 } // string_indexof
3296 
3297 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3298                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3299   ShortBranchVerifier sbv(this);
3300   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3301 
3302   int stride = 8;
3303 
3304   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3305         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3306         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3307         FOUND_SEQ_CHAR, DONE_LABEL;
3308 
3309   movptr(result, str1);
3310   if (UseAVX >= 2) {
3311     cmpl(cnt1, stride);
3312     jcc(Assembler::less, SCAN_TO_CHAR);
3313     cmpl(cnt1, 2*stride);
3314     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3315     movdl(vec1, ch);
3316     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3317     vpxor(vec2, vec2);
3318     movl(tmp, cnt1);
3319     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3320     andl(cnt1,0x0000000F);  //tail count (in chars)
3321 
3322     bind(SCAN_TO_16_CHAR_LOOP);
3323     vmovdqu(vec3, Address(result, 0));
3324     vpcmpeqw(vec3, vec3, vec1, 1);
3325     vptest(vec2, vec3);
3326     jcc(Assembler::carryClear, FOUND_CHAR);
3327     addptr(result, 32);
3328     subl(tmp, 2*stride);
3329     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3330     jmp(SCAN_TO_8_CHAR);
3331     bind(SCAN_TO_8_CHAR_INIT);
3332     movdl(vec1, ch);
3333     pshuflw(vec1, vec1, 0x00);
3334     pshufd(vec1, vec1, 0);
3335     pxor(vec2, vec2);
3336   }
3337   bind(SCAN_TO_8_CHAR);
3338   cmpl(cnt1, stride);
3339   jcc(Assembler::less, SCAN_TO_CHAR);
3340   if (UseAVX < 2) {
3341     movdl(vec1, ch);
3342     pshuflw(vec1, vec1, 0x00);
3343     pshufd(vec1, vec1, 0);
3344     pxor(vec2, vec2);
3345   }
3346   movl(tmp, cnt1);
3347   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3348   andl(cnt1,0x00000007);  //tail count (in chars)
3349 
3350   bind(SCAN_TO_8_CHAR_LOOP);
3351   movdqu(vec3, Address(result, 0));
3352   pcmpeqw(vec3, vec1);
3353   ptest(vec2, vec3);
3354   jcc(Assembler::carryClear, FOUND_CHAR);
3355   addptr(result, 16);
3356   subl(tmp, stride);
3357   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3358   bind(SCAN_TO_CHAR);
3359   testl(cnt1, cnt1);
3360   jcc(Assembler::zero, RET_NOT_FOUND);
3361   bind(SCAN_TO_CHAR_LOOP);
3362   load_unsigned_short(tmp, Address(result, 0));
3363   cmpl(ch, tmp);
3364   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3365   addptr(result, 2);
3366   subl(cnt1, 1);
3367   jccb(Assembler::zero, RET_NOT_FOUND);
3368   jmp(SCAN_TO_CHAR_LOOP);
3369 
3370   bind(RET_NOT_FOUND);
3371   movl(result, -1);
3372   jmpb(DONE_LABEL);
3373 
3374   bind(FOUND_CHAR);
3375   if (UseAVX >= 2) {
3376     vpmovmskb(tmp, vec3);
3377   } else {
3378     pmovmskb(tmp, vec3);
3379   }
3380   bsfl(ch, tmp);
3381   addptr(result, ch);
3382 
3383   bind(FOUND_SEQ_CHAR);
3384   subptr(result, str1);
3385   shrl(result, 1);
3386 
3387   bind(DONE_LABEL);
3388 } // string_indexof_char
3389 
// Locate the first occurrence of the byte 'ch' in the Latin-1 (byte) string
// 'str1' of 'cnt1' bytes. On exit 'result' holds the byte index of the first
// match, or -1 if the byte does not occur. cnt1, ch, tmp and vec1..vec3 are
// clobbered.
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 16; // 16 bytes per SSE vector

  Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
        SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1); // result serves as the scan cursor until the final index computation
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR_INIT);    // fewer than 16 bytes: scalar scan only
    cmpl(cnt1, stride*2);
    jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); // fewer than 32 bytes: skip the 32-byte loop
    movdl(vec1, ch);
    vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); // vec1 = ch replicated in all 32 byte lanes
    vpxor(vec2, vec2);                               // vec2 = 0, mask operand for vptest
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
    andl(cnt1,0x0000001F);  //tail count (in chars)

    bind(SCAN_TO_32_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
    vptest(vec2, vec3);   // CF is set iff vec3 is all zero, i.e. no byte matched
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, stride*2);
    jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
    jmp(SCAN_TO_16_CHAR);

    bind(SCAN_TO_16_CHAR_INIT);
    // Broadcast ch to all 16 byte lanes of vec1 (pshufb with an all-zero mask).
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }

  bind(SCAN_TO_16_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
  if (UseAVX < 2) {
    // Broadcast ch to all 16 byte lanes (not done above when AVX2 is absent).
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
  andl(cnt1,0x0000000F);  //tail count (in bytes)

  bind(SCAN_TO_16_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqb(vec3, vec1);
  ptest(vec2, vec3);      // CF is set iff vec3 is all zero, i.e. no byte matched
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...

  // Scalar scan of the remaining tail bytes.
  bind(SCAN_TO_CHAR_INIT);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_byte(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 1);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // Recover the in-vector offset of the first matching byte from the
  // comparison mask and add it to the cursor.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);          // index of lowest set bit == offset of first match
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1);   // cursor - base == byte index

  bind(DONE_LABEL);
} // stringL_indexof_char
3482 
3483 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3484   switch (eltype) {
3485   case T_BOOLEAN: return sizeof(jboolean);
3486   case T_BYTE:  return sizeof(jbyte);
3487   case T_SHORT: return sizeof(jshort);
3488   case T_CHAR:  return sizeof(jchar);
3489   case T_INT:   return sizeof(jint);
3490   default:
3491     ShouldNotReachHere();
3492     return -1;
3493   }
3494 }
3495 
3496 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3497   switch (eltype) {
3498   // T_BOOLEAN used as surrogate for unsigned byte
3499   case T_BOOLEAN: movzbl(dst, src);   break;
3500   case T_BYTE:    movsbl(dst, src);   break;
3501   case T_SHORT:   movswl(dst, src);   break;
3502   case T_CHAR:    movzwl(dst, src);   break;
3503   case T_INT:     movl(dst, src);     break;
3504   default:
3505     ShouldNotReachHere();
3506   }
3507 }
3508 
3509 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3510   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3511 }
3512 
3513 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3514   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3515 }
3516 
3517 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3518   const int vlen = Assembler::AVX_256bit;
3519   switch (eltype) {
3520   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3521   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3522   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3523   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3524   case T_INT:
3525     // do nothing
3526     break;
3527   default:
3528     ShouldNotReachHere();
3529   }
3530 }
3531 
3532 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3533                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3534                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3535                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3536                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3537                                         BasicType eltype) {
3538   ShortBranchVerifier sbv(this);
3539   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3540   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3541   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3542 
3543   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3544         SHORT_UNROLLED_LOOP_EXIT,
3545         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3546         UNROLLED_VECTOR_LOOP_BEGIN,
3547         END;
3548   switch (eltype) {
3549   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3550   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3551   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3552   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3553   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3554   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3555   }
3556 
3557   // For "renaming" for readibility of the code
3558   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3559                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3560                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3561 
3562   const int elsize = arrays_hashcode_elsize(eltype);
3563 
3564   /*
3565     if (cnt1 >= 2) {
3566       if (cnt1 >= 32) {
3567         UNROLLED VECTOR LOOP
3568       }
3569       UNROLLED SCALAR LOOP
3570     }
3571     SINGLE SCALAR
3572    */
3573 
3574   cmpl(cnt1, 32);
3575   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3576 
3577   // cnt1 >= 32 && generate_vectorized_loop
3578   xorl(index, index);
3579 
3580   // vresult = IntVector.zero(I256);
3581   for (int idx = 0; idx < 4; idx++) {
3582     vpxor(vresult[idx], vresult[idx]);
3583   }
3584   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3585   Register bound = tmp2;
3586   Register next = tmp3;
3587   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3588   movl(next, Address(tmp2, 0));
3589   movdl(vnext, next);
3590   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3591 
3592   // index = 0;
3593   // bound = cnt1 & ~(32 - 1);
3594   movl(bound, cnt1);
3595   andl(bound, ~(32 - 1));
3596   // for (; index < bound; index += 32) {
3597   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3598   // result *= next;
3599   imull(result, next);
3600   // loop fission to upfront the cost of fetching from memory, OOO execution
3601   // can then hopefully do a better job of prefetching
3602   for (int idx = 0; idx < 4; idx++) {
3603     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3604   }
3605   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3606   for (int idx = 0; idx < 4; idx++) {
3607     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3608     arrays_hashcode_elvcast(vtmp[idx], eltype);
3609     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3610   }
3611   // index += 32;
3612   addl(index, 32);
3613   // index < bound;
3614   cmpl(index, bound);
3615   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3616   // }
3617 
3618   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3619   subl(cnt1, bound);
3620   // release bound
3621 
3622   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3623   for (int idx = 0; idx < 4; idx++) {
3624     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3625     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3626     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3627   }
3628   // result += vresult.reduceLanes(ADD);
3629   for (int idx = 0; idx < 4; idx++) {
3630     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3631   }
3632 
3633   // } else if (cnt1 < 32) {
3634 
3635   bind(SHORT_UNROLLED_BEGIN);
3636   // int i = 1;
3637   movl(index, 1);
3638   cmpl(index, cnt1);
3639   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3640 
3641   // for (; i < cnt1 ; i += 2) {
3642   bind(SHORT_UNROLLED_LOOP_BEGIN);
3643   movl(tmp3, 961);
3644   imull(result, tmp3);
3645   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3646   movl(tmp3, tmp2);
3647   shll(tmp3, 5);
3648   subl(tmp3, tmp2);
3649   addl(result, tmp3);
3650   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3651   addl(result, tmp3);
3652   addl(index, 2);
3653   cmpl(index, cnt1);
3654   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3655 
3656   // }
3657   // if (i >= cnt1) {
3658   bind(SHORT_UNROLLED_LOOP_EXIT);
3659   jccb(Assembler::greater, END);
3660   movl(tmp2, result);
3661   shll(result, 5);
3662   subl(result, tmp2);
3663   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3664   addl(result, tmp3);
3665   // }
3666   bind(END);
3667 
3668   BLOCK_COMMENT("} // arrays_hashcode");
3669 
3670 } // arrays_hashcode
3671 
3672 // helper function for string_compare
3673 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3674                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3675                                            Address::ScaleFactor scale2, Register index, int ae) {
3676   if (ae == StrIntrinsicNode::LL) {
3677     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3678     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3679   } else if (ae == StrIntrinsicNode::UU) {
3680     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3681     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3682   } else {
3683     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3684     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3685   }
3686 }
3687 
3688 // Compare strings, used for char[] and byte[].
3689 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3690                                        Register cnt1, Register cnt2, Register result,
3691                                        XMMRegister vec1, int ae, KRegister mask) {
3692   ShortBranchVerifier sbv(this);
3693   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3694   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only AVX3
3695   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3696   int stride2x2 = 0x40;
3697   Address::ScaleFactor scale = Address::no_scale;
3698   Address::ScaleFactor scale1 = Address::no_scale;
3699   Address::ScaleFactor scale2 = Address::no_scale;
3700 
3701   if (ae != StrIntrinsicNode::LL) {
3702     stride2x2 = 0x20;
3703   }
3704 
3705   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3706     shrl(cnt2, 1);
3707   }
3708   // Compute the minimum of the string lengths and the
3709   // difference of the string lengths (stack).
3710   // Do the conditional move stuff
3711   movl(result, cnt1);
3712   subl(cnt1, cnt2);
3713   push(cnt1);
3714   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3715 
3716   // Is the minimum length zero?
3717   testl(cnt2, cnt2);
3718   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3719   if (ae == StrIntrinsicNode::LL) {
3720     // Load first bytes
3721     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3722     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3723   } else if (ae == StrIntrinsicNode::UU) {
3724     // Load first characters
3725     load_unsigned_short(result, Address(str1, 0));
3726     load_unsigned_short(cnt1, Address(str2, 0));
3727   } else {
3728     load_unsigned_byte(result, Address(str1, 0));
3729     load_unsigned_short(cnt1, Address(str2, 0));
3730   }
3731   subl(result, cnt1);
3732   jcc(Assembler::notZero,  POP_LABEL);
3733 
3734   if (ae == StrIntrinsicNode::UU) {
3735     // Divide length by 2 to get number of chars
3736     shrl(cnt2, 1);
3737   }
3738   cmpl(cnt2, 1);
3739   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3740 
3741   // Check if the strings start at the same location and setup scale and stride
3742   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3743     cmpptr(str1, str2);
3744     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3745     if (ae == StrIntrinsicNode::LL) {
3746       scale = Address::times_1;
3747       stride = 16;
3748     } else {
3749       scale = Address::times_2;
3750       stride = 8;
3751     }
3752   } else {
3753     scale1 = Address::times_1;
3754     scale2 = Address::times_2;
3755     // scale not used
3756     stride = 8;
3757   }
3758 
3759   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3760     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3761     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3762     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3763     Label COMPARE_TAIL_LONG;
3764     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only AVX3
3765 
3766     int pcmpmask = 0x19;
3767     if (ae == StrIntrinsicNode::LL) {
3768       pcmpmask &= ~0x01;
3769     }
3770 
3771     // Setup to compare 16-chars (32-bytes) vectors,
3772     // start from first character again because it has aligned address.
3773     if (ae == StrIntrinsicNode::LL) {
3774       stride2 = 32;
3775     } else {
3776       stride2 = 16;
3777     }
3778     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3779       adr_stride = stride << scale;
3780     } else {
3781       adr_stride1 = 8;  //stride << scale1;
3782       adr_stride2 = 16; //stride << scale2;
3783     }
3784 
3785     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3786     // rax and rdx are used by pcmpestri as elements counters
3787     movl(result, cnt2);
3788     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3789     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3790 
3791     // fast path : compare first 2 8-char vectors.
3792     bind(COMPARE_16_CHARS);
3793     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3794       movdqu(vec1, Address(str1, 0));
3795     } else {
3796       pmovzxbw(vec1, Address(str1, 0));
3797     }
3798     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3799     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3800 
3801     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3802       movdqu(vec1, Address(str1, adr_stride));
3803       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3804     } else {
3805       pmovzxbw(vec1, Address(str1, adr_stride1));
3806       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3807     }
3808     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3809     addl(cnt1, stride);
3810 
3811     // Compare the characters at index in cnt1
3812     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3813     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3814     subl(result, cnt2);
3815     jmp(POP_LABEL);
3816 
3817     // Setup the registers to start vector comparison loop
3818     bind(COMPARE_WIDE_VECTORS);
3819     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3820       lea(str1, Address(str1, result, scale));
3821       lea(str2, Address(str2, result, scale));
3822     } else {
3823       lea(str1, Address(str1, result, scale1));
3824       lea(str2, Address(str2, result, scale2));
3825     }
3826     subl(result, stride2);
3827     subl(cnt2, stride2);
3828     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3829     negptr(result);
3830 
3831     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3832     bind(COMPARE_WIDE_VECTORS_LOOP);
3833 
3834     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3835       cmpl(cnt2, stride2x2);
3836       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3837       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3838       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3839 
3840       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3841       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3842         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3843         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3844       } else {
3845         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3846         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3847       }
3848       kortestql(mask, mask);
3849       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3850       addptr(result, stride2x2);  // update since we already compared at this addr
3851       subl(cnt2, stride2x2);      // and sub the size too
3852       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3853 
3854       vpxor(vec1, vec1);
3855       jmpb(COMPARE_WIDE_TAIL);
3856     }//if (VM_Version::supports_avx512vlbw())
3857 
3858     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3859     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3860       vmovdqu(vec1, Address(str1, result, scale));
3861       vpxor(vec1, Address(str2, result, scale));
3862     } else {
3863       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3864       vpxor(vec1, Address(str2, result, scale2));
3865     }
3866     vptest(vec1, vec1);
3867     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3868     addptr(result, stride2);
3869     subl(cnt2, stride2);
3870     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3871     // clean upper bits of YMM registers
3872     vpxor(vec1, vec1);
3873 
3874     // compare wide vectors tail
3875     bind(COMPARE_WIDE_TAIL);
3876     testptr(result, result);
3877     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3878 
3879     movl(result, stride2);
3880     movl(cnt2, result);
3881     negptr(result);
3882     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3883 
3884     // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
3885     bind(VECTOR_NOT_EQUAL);
3886     // clean upper bits of YMM registers
3887     vpxor(vec1, vec1);
3888     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3889       lea(str1, Address(str1, result, scale));
3890       lea(str2, Address(str2, result, scale));
3891     } else {
3892       lea(str1, Address(str1, result, scale1));
3893       lea(str2, Address(str2, result, scale2));
3894     }
3895     jmp(COMPARE_16_CHARS);
3896 
3897     // Compare tail chars, length between 1 to 15 chars
3898     bind(COMPARE_TAIL_LONG);
3899     movl(cnt2, result);
3900     cmpl(cnt2, stride);
3901     jcc(Assembler::less, COMPARE_SMALL_STR);
3902 
3903     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3904       movdqu(vec1, Address(str1, 0));
3905     } else {
3906       pmovzxbw(vec1, Address(str1, 0));
3907     }
3908     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3909     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3910     subptr(cnt2, stride);
3911     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3912     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3913       lea(str1, Address(str1, result, scale));
3914       lea(str2, Address(str2, result, scale));
3915     } else {
3916       lea(str1, Address(str1, result, scale1));
3917       lea(str2, Address(str2, result, scale2));
3918     }
3919     negptr(cnt2);
3920     jmpb(WHILE_HEAD_LABEL);
3921 
3922     bind(COMPARE_SMALL_STR);
3923   } else if (UseSSE42Intrinsics) {
3924     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3925     int pcmpmask = 0x19;
3926     // Setup to compare 8-char (16-byte) vectors,
3927     // start from first character again because it has aligned address.
3928     movl(result, cnt2);
3929     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3930     if (ae == StrIntrinsicNode::LL) {
3931       pcmpmask &= ~0x01;
3932     }
3933     jcc(Assembler::zero, COMPARE_TAIL);
3934     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3935       lea(str1, Address(str1, result, scale));
3936       lea(str2, Address(str2, result, scale));
3937     } else {
3938       lea(str1, Address(str1, result, scale1));
3939       lea(str2, Address(str2, result, scale2));
3940     }
3941     negptr(result);
3942 
3943     // pcmpestri
3944     //   inputs:
3945     //     vec1- substring
3946     //     rax - negative string length (elements count)
3947     //     mem - scanned string
3948     //     rdx - string length (elements count)
3949     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3950     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3951     //   outputs:
3952     //     rcx - first mismatched element index
3953     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3954 
3955     bind(COMPARE_WIDE_VECTORS);
3956     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3957       movdqu(vec1, Address(str1, result, scale));
3958       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3959     } else {
3960       pmovzxbw(vec1, Address(str1, result, scale1));
3961       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3962     }
3963     // After pcmpestri cnt1(rcx) contains mismatched element index
3964 
3965     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3966     addptr(result, stride);
3967     subptr(cnt2, stride);
3968     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3969 
3970     // compare wide vectors tail
3971     testptr(result, result);
3972     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3973 
3974     movl(cnt2, stride);
3975     movl(result, stride);
3976     negptr(result);
3977     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3978       movdqu(vec1, Address(str1, result, scale));
3979       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3980     } else {
3981       pmovzxbw(vec1, Address(str1, result, scale1));
3982       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3983     }
3984     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3985 
3986     // Mismatched characters in the vectors
3987     bind(VECTOR_NOT_EQUAL);
3988     addptr(cnt1, result);
3989     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3990     subl(result, cnt2);
3991     jmpb(POP_LABEL);
3992 
3993     bind(COMPARE_TAIL); // limit is zero
3994     movl(cnt2, result);
3995     // Fallthru to tail compare
3996   }
3997   // Shift str2 and str1 to the end of the arrays, negate min
3998   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3999     lea(str1, Address(str1, cnt2, scale));
4000     lea(str2, Address(str2, cnt2, scale));
4001   } else {
4002     lea(str1, Address(str1, cnt2, scale1));
4003     lea(str2, Address(str2, cnt2, scale2));
4004   }
4005   decrementl(cnt2);  // first character was compared already
4006   negptr(cnt2);
4007 
4008   // Compare the rest of the elements
4009   bind(WHILE_HEAD_LABEL);
4010   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
4011   subl(result, cnt1);
4012   jccb(Assembler::notZero, POP_LABEL);
4013   increment(cnt2);
4014   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
4015 
4016   // Strings are equal up to min length.  Return the length difference.
4017   bind(LENGTH_DIFF_LABEL);
4018   pop(result);
4019   if (ae == StrIntrinsicNode::UU) {
4020     // Divide diff by 2 to get number of chars
4021     sarl(result, 1);
4022   }
4023   jmpb(DONE_LABEL);
4024 
4025   if (VM_Version::supports_avx512vlbw()) {
4026 
4027     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
4028 
4029     kmovql(cnt1, mask);
4030     notq(cnt1);
4031     bsfq(cnt2, cnt1);
4032     if (ae != StrIntrinsicNode::LL) {
4033       // Divide diff by 2 to get number of chars
4034       sarl(cnt2, 1);
4035     }
4036     addq(result, cnt2);
4037     if (ae == StrIntrinsicNode::LL) {
4038       load_unsigned_byte(cnt1, Address(str2, result));
4039       load_unsigned_byte(result, Address(str1, result));
4040     } else if (ae == StrIntrinsicNode::UU) {
4041       load_unsigned_short(cnt1, Address(str2, result, scale));
4042       load_unsigned_short(result, Address(str1, result, scale));
4043     } else {
4044       load_unsigned_short(cnt1, Address(str2, result, scale2));
4045       load_unsigned_byte(result, Address(str1, result, scale1));
4046     }
4047     subl(result, cnt1);
4048     jmpb(POP_LABEL);
4049   }//if (VM_Version::supports_avx512vlbw())
4050 
4051   // Discard the stored length difference
4052   bind(POP_LABEL);
4053   pop(cnt1);
4054 
4055   // That's it
4056   bind(DONE_LABEL);
4057   if(ae == StrIntrinsicNode::UL) {
4058     negl(result);
4059   }
4060 
4061 }
4062 
4063 // Search for Non-ASCII character (Negative byte value) in a byte array,
4064 // return the index of the first such character, otherwise the length
4065 // of the array segment searched.
4066 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4067 //   @IntrinsicCandidate
4068 //   public static int countPositives(byte[] ba, int off, int len) {
4069 //     for (int i = off; i < off + len; i++) {
4070 //       if (ba[i] < 0) {
4071 //         return i - off;
4072 //       }
4073 //     }
4074 //     return len;
4075 //   }
void C2_MacroAssembler::count_positives(Register ary1, Register len,
  Register result, Register tmp1,
  XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
  // Emits code that scans ary1[0..len) for the first byte with the sign bit
  // set, leaving in 'result' the count of leading non-negative bytes
  // (result == len when no negative byte was found).
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  movl(result, len); // copy; result is only adjusted down when a negative byte is found
  // len == 0
  testl(len, len);
  jcc(Assembler::zero, DONE);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
    VM_Version::supports_avx512vlbw() &&
    VM_Version::supports_bmi2()) {

    // AVX-512 path: test 64 bytes per iteration with a signed compare
    // against a zero vector (0 > byte <=> byte is negative).
    Label test_64_loop, test_tail, BREAK_LOOP;
    movl(tmp1, len);
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); // vec2 := all-zero comparand

    andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
    andl(len,  0xffffffc0); // vector count (in chars)
    jccb(Assembler::zero, test_tail);

    // Point ary1 past the vectorized region and run 'len' from -count up to 0.
    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len);

    bind(test_64_loop);
    // Check whether our 64 elements of size byte contain negatives
    evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(mask1, mask1);
    jcc(Assembler::notZero, BREAK_LOOP);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done
    // (testl against -1 sets ZF iff tmp1 == 0)
    testl(tmp1, -1);
    jcc(Assembler::zero, DONE);


    // check the tail for absence of negatives
    // ~(~0 << len) applied up to two times (for 32-bit scenario)
    {
      Register tmp3_aliased = len;  // len's vector count is consumed; reuse the register
      mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
      shlxq(tmp3_aliased, tmp3_aliased, tmp1);
      notq(tmp3_aliased);
      kmovql(mask2, tmp3_aliased);  // mask2 selects only the tmp1 tail bytes
    }

    // Masked compare: only lanes selected by mask2 participate.
    evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(mask1, mask2);
    jcc(Assembler::zero, DONE);

    // do a full check for negative registers in the tail
    movl(len, tmp1); // tmp1 holds low 6-bit from original len;
                     // ary1 already pointing to the right place
    jmpb(TAIL_START);

    bind(BREAK_LOOP);
    // At least one byte in the last 64 byte block was negative.
    // Set up to look at the last 64 bytes as if they were a tail
    lea(ary1, Address(ary1, len, Address::times_1));
    addptr(result, len);
    // Ignore the very last byte: if all others are positive,
    // it must be negative, so we can skip right to the 2+1 byte
    // end comparison at this point
    orl(result, 63);
    movl(len, 63);
    // Fallthru to tail compare
  } else {

    if (UseAVX >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 32-byte vectors
      testl(len, 0xffffffe0);   // vector count (in bytes)
      jccb(Assembler::zero, TAIL_START);

      andl(len, 0xffffffe0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      // Broadcast 0x80 into every byte lane: vptest against this mask is
      // non-zero iff some byte has its sign bit set.
      movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 32);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000001f);   // any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      // (reads the last, possibly overlapping, 32 bytes of the tail)
      movl(len, result);
      andl(len, 0x0000001f);
      vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
      vptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      // There are zeros, jump to the tail to determine exactly where
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 32-byte vector is negative.
      // Set up to look at the last 32 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 31);
      movl(len, 31);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 16-byte vectors
      testl(len, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, TAIL_START);

      andl(len, 0xfffffff0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      // Broadcast 0x80 into every byte lane of the 16-byte mask.
      movl(tmp1, 0x80808080);
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 16);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000000f); // len is zero, any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000000f);   // tail count (in bytes)
      movdqu(vec1, Address(ary1, len, Address::times_1, -16));
      ptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 16-byte vector is negative.
      // Set up and look at the last 16 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 15);
      movl(len, 15);
      // Fallthru to tail compare
    }
  }

  // Scalar tail: at most 63 bytes remain; locate the first negative byte.
  bind(TAIL_START);
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  andl(tmp1, 0x80808080);
  jccb(Assembler::notZero, TAIL_ADJUST);
  addptr(len, 4);
  jccb(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2-3 bytes), if any
  bind(COMPARE_CHAR);

  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080);
  jccb(Assembler::notZero, CHAR_ADJUST);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1);   // tail  byte
  jccb(Assembler::zero, DONE);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  testl(tmp1, 0x00000080);
  jccb(Assembler::zero, DONE);
  subptr(result, 1);    // last byte is negative: exclude it from the count
  jmpb(DONE);

  bind(TAIL_ADJUST);
  // there are negative bits in the last 4 byte block.
  // Adjust result and check the next three bytes
  addptr(result, len);
  orl(result, 3);
  lea(ary1, Address(ary1, len, Address::times_1));
  jmpb(COMPARE_CHAR);

  bind(CHAR_ADJUST);
  // We are looking at a char + optional byte tail, and found that one
  // of the bytes in the char is negative. Adjust the result, check the
  // first byte and readjust if needed.
  andl(result, 0xfffffffc);
  testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
  jccb(Assembler::notZero, DONE);
  addptr(result, 1);
  
  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
4307 
4308 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
// Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                      Register limit, Register result, Register chr,
                                      XMMRegister vec1, XMMRegister vec2, bool is_char,
                                      KRegister mask, bool expand_ary2) {
  // Emits code that sets 'result' to 1 when the two arrays/substrings are
  // equal and to 0 otherwise.
  //   is_array_equ - full Arrays.equals semantics: also compare identity,
  //                  null-check both arrays, compare lengths, and skip over
  //                  the array object headers.
  //   expand_ary2  - ary2 holds bytes that are zero-extended to 16-bit chars
  //                  before comparing against ary1 (AVX2-only, see assert).
  // for expand_ary2, limit is the (smaller) size of the second array.
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;

  assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
         "Expansion only implemented for AVX2");

  int length_offset  = arrayOopDesc::length_offset_in_bytes();
  int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);

  // With expansion, ary1 is indexed in chars (x2) while ary2 is indexed in
  // bytes, and each iteration consumes half as many ary2 bytes.
  Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
  int scaleIncr = expand_ary2 ? 8 : 16;

  if (is_array_equ) {
    // Check the input args
    cmpoop(ary1, ary2);
    jcc(Assembler::equal, TRUE_LABEL);

    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }

  if (is_array_equ && is_char) {
    // arrays_equals when used for char[].
    shll(limit, 1);      // byte count != 0
  }
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;

    // Compare 32-byte vectors
    if (expand_ary2) {
      andl(result, 0x0000000f);  //   tail count (in bytes)
      andl(limit, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL);
    } else {
      andl(result, 0x0000001f);  //   tail count (in bytes)
      andl(limit, 0xffffffe0);   // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL_16);
    }

    // Point both arrays past the vectorized region; run 'limit' from
    // -count up to 0 as the loop index.
    lea(ary1, Address(ary1, limit, scaleFactor));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;

      cmpl(limit, -64);
      jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop

      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
      addptr(limit, 64);  // update since we already compared at this addr
      cmpl(limit, -64);
      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instruction and just continue via non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL);  // true
      // But since we stopped at the points ary{1,2}+limit which are
      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
      //
      addptr(result, -64);   // it is safe, bc we just came from this area
      evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare

      jmp(TRUE_LABEL);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    }//if (VM_Version::supports_avx512vlbw())

    // 32-byte (or 16-byte-expanded) AVX2 loop.
    bind(COMPARE_WIDE_VECTORS);
    vmovdqu(vec1, Address(ary1, limit, scaleFactor));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
    } else {
      vmovdqu(vec2, Address(ary2, limit, Address::times_1));
    }
    vpxor(vec1, vec2);  // equal iff xor is all-zero

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, scaleIncr * 2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    // Compare the tail as one (possibly overlapping) trailing vector.
    vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
    } else {
      vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
    }
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    jmp(TRUE_LABEL);

    bind(COMPARE_TAIL_16); // limit is zero
    movl(limit, result);

    // Compare 16-byte chunks
    andl(result, 0x0000000f);  //   tail count (in bytes)
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, scaleFactor));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS_16);
    movdqu(vec1, Address(ary1, limit, scaleFactor));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
    } else {
      movdqu(vec2, Address(ary2, limit, Address::times_1));
    }
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, scaleIncr);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  } else if (UseSSE42Intrinsics) {
    // With SSE4.2, use double quad vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 16-byte vectors
    andl(result, 0x0000000f);  //   tail count (in bytes)
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(ary1, limit, Address::times_1));
    movdqu(vec2, Address(ary2, limit, Address::times_1));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 16);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    // Compare the tail as one (possibly overlapping) trailing 16-byte chunk.
    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  }

  // Compare 4-byte vectors
  if (expand_ary2) {
    testl(result, result);
    jccb(Assembler::zero, TRUE_LABEL);
  } else {
    andl(limit, 0xfffffffc); // vector count (in bytes)
    jccb(Assembler::zero, COMPARE_CHAR);
  }

  lea(ary1, Address(ary1, limit, scaleFactor));
  lea(ary2, Address(ary2, limit, Address::times_1));
  negptr(limit);

  bind(COMPARE_VECTORS);
  if (expand_ary2) {
    // There are no "vector" operations for bytes to shorts
    movzbl(chr, Address(ary2, limit, Address::times_1));
    cmpw(Address(ary1, limit, Address::times_2), chr);
    jccb(Assembler::notEqual, FALSE_LABEL);
    addptr(limit, 1);
    jcc(Assembler::notZero, COMPARE_VECTORS);
    jmp(TRUE_LABEL);
  } else {
    movl(chr, Address(ary1, limit, Address::times_1));
    cmpl(chr, Address(ary2, limit, Address::times_1));
    jccb(Assembler::notEqual, FALSE_LABEL);
    addptr(limit, 4);
    jcc(Assembler::notZero, COMPARE_VECTORS);
  }

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(chr, Address(ary1, 0));
  load_unsigned_short(limit, Address(ary2, 0));
  cmpl(chr, limit);
  jccb(Assembler::notEqual, FALSE_LABEL);

  if (is_array_equ && is_char) {
    // char[] length is even, so there can be no trailing single byte.
    bind(COMPARE_BYTE);
  } else {
    lea(ary1, Address(ary1, 2));
    lea(ary2, Address(ary2, 2));

    bind(COMPARE_BYTE);
    testl(result, 0x1);   // tail  byte
    jccb(Assembler::zero, TRUE_LABEL);
    load_unsigned_byte(chr, Address(ary1, 0));
    load_unsigned_byte(limit, Address(ary2, 0));
    cmpl(chr, limit);
    jccb(Assembler::notEqual, FALSE_LABEL);
  }
  bind(TRUE_LABEL);
  movl(result, 1);   // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
4580 
4581 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4582 #define __ masm.
4583   Register dst = stub.data<0>();
4584   XMMRegister src = stub.data<1>();
4585   address target = stub.data<2>();
4586   __ bind(stub.entry());
4587   __ subptr(rsp, 8);
4588   __ movdbl(Address(rsp), src);
4589   __ call(RuntimeAddress(target));
4590   // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
4591   __ pop(dst);
4592   __ jmp(stub.continuation());
4593 #undef __
4594 }
4595 
4596 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4597   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4598   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4599 
4600   address slowpath_target;
4601   if (dst_bt == T_INT) {
4602     if (src_bt == T_FLOAT) {
4603       cvttss2sil(dst, src);
4604       cmpl(dst, 0x80000000);
4605       slowpath_target = StubRoutines::x86::f2i_fixup();
4606     } else {
4607       cvttsd2sil(dst, src);
4608       cmpl(dst, 0x80000000);
4609       slowpath_target = StubRoutines::x86::d2i_fixup();
4610     }
4611   } else {
4612     if (src_bt == T_FLOAT) {
4613       cvttss2siq(dst, src);
4614       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4615       slowpath_target = StubRoutines::x86::f2l_fixup();
4616     } else {
4617       cvttsd2siq(dst, src);
4618       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4619       slowpath_target = StubRoutines::x86::d2l_fixup();
4620     }
4621   }
4622 
4623   // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
4624   int max_size = 23 + (UseAPX ? 1 : 0);
4625   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
4626   jcc(Assembler::equal, stub->entry());
4627   bind(stub->continuation());
4628 }
4629 
4630 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4631                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4632   switch(ideal_opc) {
4633     case Op_LShiftVS:
4634       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4635     case Op_LShiftVI:
4636       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4637     case Op_LShiftVL:
4638       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4639     case Op_RShiftVS:
4640       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4641     case Op_RShiftVI:
4642       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4643     case Op_RShiftVL:
4644       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4645     case Op_URShiftVS:
4646       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4647     case Op_URShiftVI:
4648       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4649     case Op_URShiftVL:
4650       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4651     case Op_RotateRightV:
4652       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4653     case Op_RotateLeftV:
4654       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4655     default:
4656       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4657       break;
4658   }
4659 }
4660 
4661 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4662                                                XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4663   if (is_unsigned) {
4664     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4665   } else {
4666     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4667   }
4668 }
4669 
4670 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4671                                                       XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4672   switch (elem_bt) {
4673     case T_BYTE:
4674       if (ideal_opc == Op_SaturatingAddV) {
4675         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4676       } else {
4677         assert(ideal_opc == Op_SaturatingSubV, "");
4678         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4679       }
4680       break;
4681     case T_SHORT:
4682       if (ideal_opc == Op_SaturatingAddV) {
4683         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4684       } else {
4685         assert(ideal_opc == Op_SaturatingSubV, "");
4686         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4687       }
4688       break;
4689     default:
4690       fatal("Unsupported type %s", type2name(elem_bt));
4691       break;
4692   }
4693 }
4694 
4695 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4696                                                         XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4697   switch (elem_bt) {
4698     case T_BYTE:
4699       if (ideal_opc == Op_SaturatingAddV) {
4700         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4701       } else {
4702         assert(ideal_opc == Op_SaturatingSubV, "");
4703         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4704       }
4705       break;
4706     case T_SHORT:
4707       if (ideal_opc == Op_SaturatingAddV) {
4708         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4709       } else {
4710         assert(ideal_opc == Op_SaturatingSubV, "");
4711         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4712       }
4713       break;
4714     default:
4715       fatal("Unsupported type %s", type2name(elem_bt));
4716       break;
4717   }
4718 }
4719 
4720 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4721                                                Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4722   if (is_unsigned) {
4723     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4724   } else {
4725     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4726   }
4727 }
4728 
4729 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4730                                                       XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4731   switch (elem_bt) {
4732     case T_BYTE:
4733       if (ideal_opc == Op_SaturatingAddV) {
4734         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4735       } else {
4736         assert(ideal_opc == Op_SaturatingSubV, "");
4737         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4738       }
4739       break;
4740     case T_SHORT:
4741       if (ideal_opc == Op_SaturatingAddV) {
4742         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4743       } else {
4744         assert(ideal_opc == Op_SaturatingSubV, "");
4745         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4746       }
4747       break;
4748     default:
4749       fatal("Unsupported type %s", type2name(elem_bt));
4750       break;
4751   }
4752 }
4753 
4754 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4755                                                         XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4756   switch (elem_bt) {
4757     case T_BYTE:
4758       if (ideal_opc == Op_SaturatingAddV) {
4759         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4760       } else {
4761         assert(ideal_opc == Op_SaturatingSubV, "");
4762         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4763       }
4764       break;
4765     case T_SHORT:
4766       if (ideal_opc == Op_SaturatingAddV) {
4767         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4768       } else {
4769         assert(ideal_opc == Op_SaturatingSubV, "");
4770         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4771       }
4772       break;
4773     default:
4774       fatal("Unsupported type %s", type2name(elem_bt));
4775       break;
4776   }
4777 }
4778 
// Dispatches a masked (predicated) vector ideal opcode to the matching
// AVX-512 instruction emitter, with both vector operands in registers.
// mask selects the active lanes; merge is forwarded to the emitters
// (merge- vs. zero-masking of inactive lanes — see Assembler);
// is_varshift distinguishes per-lane variable shifts from uniform shifts.
// eType is consulted only by emitters that need the lane type (rearrange,
// rotate, min/max, logical ops); the remaining opcodes encode it already.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
                                    bool is_varshift) {
  switch (ideal_opc) {
    // Integer and floating-point add/sub/mul/div.
    case Op_AddVB:
      evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVS:
      evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVI:
      evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVL:
      evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVF:
      evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVD:
      evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVB:
      evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVS:
      evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVI:
      evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVL:
      evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVF:
      evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVD:
      evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVS:
      evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVI:
      evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVL:
      evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVF:
      evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVD:
      evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVF:
      evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVD:
      evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SqrtVF:
      evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SqrtVD:
      evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Unary ops take their input from src2.
    case Op_AbsVB:
      evpabsb(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVS:
      evpabsw(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVI:
      evpabsd(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVL:
      evpabsq(dst, mask, src2, merge, vlen_enc); break;
    case Op_FmaVF:
      evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVD:
      evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Note: rearrange passes src2 (the table) before src1 (the indices).
    case Op_VectorRearrange:
      evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
    // Shifts and rotates; is_varshift picks the per-lane-count variant.
    case Op_LShiftVS:
      evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_LShiftVI:
      evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_LShiftVL:
      evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVS:
      evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVI:
      evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVL:
      evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVS:
      evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVI:
      evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVL:
      evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RotateLeftV:
      evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_RotateRightV:
      evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    // Min/max and logical ops dispatch on the lane type internally.
    case Op_MaxV:
      evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MinV:
      evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMinV:
      evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMaxV:
      evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_XorV:
      evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_OrV:
      evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AndV:
      evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4880 
// Dispatches a masked (predicated) vector ideal opcode to the matching
// AVX-512 instruction emitter, with the second vector operand in memory.
// A subset of the register-register overload: only opcodes whose emitters
// accept a memory operand are handled; unary ops, shifts, rotates and
// rearrange are intentionally absent.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, Address src2, bool merge, int vlen_enc) {
  switch (ideal_opc) {
    case Op_AddVB:
      evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVS:
      evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVI:
      evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVL:
      evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVF:
      evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVD:
      evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVB:
      evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVS:
      evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVI:
      evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVL:
      evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVF:
      evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVD:
      evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVS:
      evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVI:
      evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVL:
      evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVF:
      evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVD:
      evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVF:
      evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVD:
      evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVF:
      evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVD:
      evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MaxV:
      evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MinV:
      evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMaxV:
      evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMinV:
      evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_XorV:
      evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_OrV:
      evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AndV:
      evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4945 
4946 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4947                                   KRegister src1, KRegister src2) {
4948   BasicType etype = T_ILLEGAL;
4949   switch(mask_len) {
4950     case 2:
4951     case 4:
4952     case 8:  etype = T_BYTE; break;
4953     case 16: etype = T_SHORT; break;
4954     case 32: etype = T_INT; break;
4955     case 64: etype = T_LONG; break;
4956     default: fatal("Unsupported type"); break;
4957   }
4958   assert(etype != T_ILLEGAL, "");
4959   switch(ideal_opc) {
4960     case Op_AndVMask:
4961       kand(etype, dst, src1, src2); break;
4962     case Op_OrVMask:
4963       kor(etype, dst, src1, src2); break;
4964     case Op_XorVMask:
4965       kxor(etype, dst, src1, src2); break;
4966     default:
4967       fatal("Unsupported masked operation"); break;
4968   }
4969 }
4970 
4971 /*
4972  * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
4973  * If src is NaN, the result is 0.
4974  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4975  * the result is equal to the value of Integer.MIN_VALUE.
4976  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4977  * the result is equal to the value of Integer.MAX_VALUE.
4978  */
// Fixes up the result of a truncating float->int vector convert (AVX path,
// no opmask registers). On entry dst holds the raw vcvttps2dq result; lanes
// produced from special source values carry the float_sign_flip marker
// value. See the block comment above for the Java-mandated results.
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                   XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
                                                                   Register rscratch, AddressLiteral float_sign_flip,
                                                                   int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // Fast path: if no destination lane equals the sign-flip marker, there
  // were no special source values and dst is already correct.
  vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
  vptest(xtmp2, xtmp2, vec_enc);
  jccb(Assembler::equal, done);

  // Flip all bits of the marker in xtmp1 (marker XOR all-ones).
  vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
  vpxor(xtmp1, xtmp1, xtmp4, vec_enc);

  // Zero the destination lanes whose source is unordered (NaN).
  vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
  vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);

  // Recompute the mask for remaining special value.
  vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
  // Extract SRC values corresponding to TRUE mask lanes.
  vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
  // values are set.
  vpxor(xtmp3, xtmp2, xtmp4, vec_enc);

  // Replace those lanes with the flipped marker from xtmp1.
  vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
  bind(done);
}
5008 
// Fixes up the result of a truncating float->int vector convert (EVEX path,
// using opmask registers ktmp1/ktmp2 instead of vector blends). On entry dst
// holds the raw convert result; special-value lanes carry the
// float_sign_flip marker value.
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                    XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                    Register rscratch, AddressLiteral float_sign_flip,
                                                                    int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // Fast path: no lane equals the marker => no special source values.
  evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero the destination lanes whose source is unordered (NaN).
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes: those >= 0 (NLT_UQ against zero) get the
  // bitwise complement of zero... built via vpternlogd(0x11) = ~xtmp1,
  // which with xtmp1 = marker yields marker-flipped (max int pattern).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5030 
// Fixes up the result of a truncating float->long vector convert (EVEX
// path). dst holds quadword lanes (hence the evpcmpeqq/evmovdquq forms),
// while src holds the original float lanes (hence evcmpps). Special-value
// lanes in dst carry the double_sign_flip marker value.
void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral double_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // Fast path: no quadword lane equals the marker => nothing to fix up.
  evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero the destination lanes whose float source is unordered (NaN).
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes with a non-negative source get the flipped
  // marker (vpternlogq 0x11 computes ~xtmp1, i.e. the max-value pattern).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5053 
// Fixes up the result of a truncating double->int vector convert (EVEX
// path). dst holds int lanes (evpcmpeqd/evmovdqul forms) while src holds
// the original double lanes (evcmppd). Special-value lanes in dst carry
// the float_sign_flip marker value.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral float_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // Fast path: no lane equals the marker => nothing to fix up.
  evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero the destination lanes whose double source is unordered (NaN).
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes with a non-negative source get the flipped
  // marker (vpternlogq 0x11 computes ~xtmp1, i.e. the max-value pattern).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5075 
5076 /*
5077  * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
5078  * If src is NaN, the result is 0.
5079  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5080  * the result is equal to the value of Long.MIN_VALUE.
5081  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5082  * the result is equal to the value of Long.MAX_VALUE.
5083  */
// Fixes up the result of a truncating double->long vector convert (EVEX
// path); see the block comment above for the Java-mandated results.
// Special-value lanes in dst carry the double_sign_flip marker value.
void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                      Register rscratch, AddressLiteral double_sign_flip,
                                                                      int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // Fast path: no quadword lane equals the marker => nothing to fix up.
  evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero the destination lanes whose double source is unordered (NaN).
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes with a non-negative source get the flipped
  // marker (vpternlogq 0x11 computes ~xtmp1, i.e. the max-value pattern).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5106 
5107 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5108                                                              XMMRegister xtmp, int index, int vec_enc) {
5109    assert(vec_enc < Assembler::AVX_512bit, "");
5110    if (vec_enc == Assembler::AVX_256bit) {
5111      vextractf128_high(xtmp, src);
5112      vshufps(dst, src, xtmp, index, vec_enc);
5113    } else {
5114      vshufps(dst, src, zero, index, vec_enc);
5115    }
5116 }
5117 
// Fixes up the result of a truncating double->int vector convert (AVX
// path, no opmask registers). dst holds a 128-bit int-lane result while
// src keeps the original doubles at src_vec_enc width, so each double
// compare mask is packed down to doubleword lanes before blending.
// Special-value lanes in dst carry the float_sign_flip marker value.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                                    XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
                                                                    AddressLiteral float_sign_flip, int src_vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");

  Label done;
  // Compare the destination lanes with float_sign_flip
  // value to get mask for all special values.
  movdqu(xtmp1, float_sign_flip, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
  ptest(xtmp2, xtmp2);
  jccb(Assembler::equal, done);

  // Flip float_sign_flip to get max integer value.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
  pxor(xtmp1, xtmp4);

  // Set destination lanes corresponding to unordered source lanes as zero.
  vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
  vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);

  // Shuffle mask vector and pack lower double word from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);

  // Recompute the mask for remaining special value.
  pxor(xtmp2, xtmp3);
  // Extract mask corresponding to non-negative source lanes.
  vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);

  // Shuffle mask vector and pack lower double word from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  pand(xtmp3, xtmp2);

  // Replace destination lanes holding special value(0x80000000) with max int
  // if corresponding source lane holds a +ve value.
  vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
  bind(done);
}
5157 
5158 
// Narrows the int lanes of dst to short or byte lanes in place.
// Each lane is first masked to the target width (so the unsigned-saturating
// pack cannot clip already-truncated values), then packed with vpackusdw /
// vpackuswb against the zero vector; on 256-bit vectors a cross-lane
// doubleword pack re-gathers the per-128-bit-lane pack results.
void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
                                                   XMMRegister xtmp, Register rscratch, int vec_enc) {
  switch(to_elem_bt) {
    case T_SHORT:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      break;
    case  T_BYTE:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      // Bytes need a second pack step: dword -> word -> byte.
      vpackuswb(dst, dst, zero, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt));
  }
}
5182 
5183 /*
5184  * Algorithm for vector D2L and F2I conversions (AVX 10.2 unsupported):-
5185  * a) Perform vector D2L/F2I cast.
5186  * b) Choose fast path if none of the result vector lane contains 0x80000000 value.
5187  *    It signifies that source value could be any of the special floating point
5188  *    values(NaN,-Inf,Inf,Max,-Min).
5189  * c) Set destination to zero if source is NaN value.
5190  * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5191  */
5192 
5193 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5194                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5195                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5196   int to_elem_sz = type2aelembytes(to_elem_bt);
5197   assert(to_elem_sz <= 4, "");
5198   vcvttps2dq(dst, src, vec_enc);
5199   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5200   if (to_elem_sz < 4) {
5201     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5202     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5203   }
5204 }
5205 
5206 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5207                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5208                                             Register rscratch, int vec_enc) {
5209   int to_elem_sz = type2aelembytes(to_elem_bt);
5210   assert(to_elem_sz <= 4, "");
5211   vcvttps2dq(dst, src, vec_enc);
5212   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5213   switch(to_elem_bt) {
5214     case T_INT:
5215       break;
5216     case T_SHORT:
5217       evpmovdw(dst, dst, vec_enc);
5218       break;
5219     case T_BYTE:
5220       evpmovdb(dst, dst, vec_enc);
5221       break;
5222     default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt));
5223   }
5224 }
5225 
// Casts a float vector to a long vector on EVEX targets: truncating
// convert, then fix up lanes whose source was a special FP value.
void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                            KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
                                            Register rscratch, int vec_enc) {
  evcvttps2qq(dst, src, vec_enc);
  vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
}
5232 
5233 // Handling for downcasting from double to integer or sub-word types on AVX2.
5234 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5235                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5236                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5237   int to_elem_sz = type2aelembytes(to_elem_bt);
5238   assert(to_elem_sz < 8, "");
5239   vcvttpd2dq(dst, src, vec_enc);
5240   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5241                                               float_sign_flip, vec_enc);
5242   if (to_elem_sz < 4) {
5243     // xtmp4 holds all zero lanes.
5244     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5245   }
5246 }
5247 
// Casts a double vector to an integral vector on EVEX targets. With
// AVX512DQ a direct double->long convert is used (then narrowed stepwise
// for sub-long targets); otherwise the convert goes double->int and the
// target width must be at most 4 bytes. Both paths fix up special-value
// lanes after the raw convert.
void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
                                            XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
                                            KRegister ktmp2, AddressLiteral sign_flip,
                                            Register rscratch, int vec_enc) {
  if (VM_Version::supports_avx512dq()) {
    evcvttpd2qq(dst, src, vec_enc);
    vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    switch(to_elem_bt) {
      case T_LONG:
        break;
      case T_INT:
        evpmovsqd(dst, dst, vec_enc);
        break;
      case T_SHORT:
        // Narrow long -> int -> short.
        evpmovsqd(dst, dst, vec_enc);
        evpmovdw(dst, dst, vec_enc);
        break;
      case T_BYTE:
        // Narrow long -> int -> byte.
        evpmovsqd(dst, dst, vec_enc);
        evpmovdb(dst, dst, vec_enc);
        break;
      default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt));
    }
  } else {
    assert(type2aelembytes(to_elem_bt) <= 4, "");
    vcvttpd2dq(dst, src, vec_enc);
    vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    switch(to_elem_bt) {
      case T_INT:
        break;
      case T_SHORT:
        evpmovdw(dst, dst, vec_enc);
        break;
      case T_BYTE:
        evpmovdb(dst, dst, vec_enc);
        break;
      default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt));
    }
  }
}
5288 
5289 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5290   switch(to_elem_bt) {
5291     case T_LONG:
5292       evcvttps2qqs(dst, src, vec_enc);
5293       break;
5294     case T_INT:
5295       evcvttps2dqs(dst, src, vec_enc);
5296       break;
5297     case T_SHORT:
5298       evcvttps2dqs(dst, src, vec_enc);
5299       evpmovdw(dst, dst, vec_enc);
5300       break;
5301     case T_BYTE:
5302       evcvttps2dqs(dst, src, vec_enc);
5303       evpmovdb(dst, dst, vec_enc);
5304       break;
5305     default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt));
5306   }
5307 }
5308 
5309 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5310   switch(to_elem_bt) {
5311     case T_LONG:
5312       evcvttps2qqs(dst, src, vec_enc);
5313       break;
5314     case T_INT:
5315       evcvttps2dqs(dst, src, vec_enc);
5316       break;
5317     case T_SHORT:
5318       evcvttps2dqs(dst, src, vec_enc);
5319       evpmovdw(dst, dst, vec_enc);
5320       break;
5321     case T_BYTE:
5322       evcvttps2dqs(dst, src, vec_enc);
5323       evpmovdb(dst, dst, vec_enc);
5324       break;
5325     default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt));
5326   }
5327 }
5328 
5329 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5330   switch(to_elem_bt) {
5331     case T_LONG:
5332       evcvttpd2qqs(dst, src, vec_enc);
5333       break;
5334     case T_INT:
5335       evcvttpd2dqs(dst, src, vec_enc);
5336       break;
5337     case T_SHORT:
5338       evcvttpd2dqs(dst, src, vec_enc);
5339       evpmovdw(dst, dst, vec_enc);
5340       break;
5341     case T_BYTE:
5342       evcvttpd2dqs(dst, src, vec_enc);
5343       evpmovdb(dst, dst, vec_enc);
5344       break;
5345     default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (reg src): %s", type2name(to_elem_bt));
5346   }
5347 }
5348 
5349 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5350   switch(to_elem_bt) {
5351     case T_LONG:
5352       evcvttpd2qqs(dst, src, vec_enc);
5353       break;
5354     case T_INT:
5355       evcvttpd2dqs(dst, src, vec_enc);
5356       break;
5357     case T_SHORT:
5358       evcvttpd2dqs(dst, src, vec_enc);
5359       evpmovdw(dst, dst, vec_enc);
5360       break;
5361     case T_BYTE:
5362       evcvttpd2dqs(dst, src, vec_enc);
5363       evpmovdb(dst, dst, vec_enc);
5364       break;
5365     default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt));
5366   }
5367 }
5368 
5369 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5370                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5371                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5372   // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
5373   // and re-instantiate original MXCSR.RC mode after that.
5374   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5375 
5376   mov64(tmp, julong_cast(0.5L));
5377   evpbroadcastq(xtmp1, tmp, vec_enc);
5378   vaddpd(xtmp1, src , xtmp1, vec_enc);
5379   evcvtpd2qq(dst, xtmp1, vec_enc);
5380   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5381                                                 double_sign_flip, vec_enc);;
5382 
5383   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5384 }
5385 
// Vector Math.round for floats on EVEX targets: computes floor(val + 0.5)
// per lane under a temporarily modified MXCSR rounding mode, then fixes up
// special-value lanes and restores the standard MXCSR state.
void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Broadcast 0.5f into every lane and add it to the source.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  vcvtps2dq(dst, xtmp1, vec_enc);
  // Fix up lanes whose source was NaN/Inf/out-of-range.
  vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                              float_sign_flip, vec_enc);

  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5403 
// AVX (non-EVEX) variant of vector_round_float_evex: rounds each float lane
// to an int by computing floor(val + 0.5) under an MXCSR rounding mode of
// round-toward-negative-infinity, restoring the standard MXCSR state before
// returning. tmp and xtmp1..xtmp4 are clobbered as scratch.
void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
                                               AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                               Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Broadcast the bit pattern of 0.5f into every lane and add it to the input.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  // Convert under the round-toward -inf mode loaded above.
  vcvtps2dq(dst, xtmp1, vec_enc);
  // Fix up lanes the conversion could not represent (NaN / out-of-int-range
  // inputs); see the special-cases helper.
  vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);

  // Restore the standard MXCSR state.
  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5420 
5421 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5422                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5423   switch (from_elem_bt) {
5424     case T_BYTE:
5425       switch (to_elem_bt) {
5426         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5427         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5428         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5429         default: ShouldNotReachHere();
5430       }
5431       break;
5432     case T_SHORT:
5433       switch (to_elem_bt) {
5434         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5435         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5436         default: ShouldNotReachHere();
5437       }
5438       break;
5439     case T_INT:
5440       assert(to_elem_bt == T_LONG, "");
5441       vpmovzxdq(dst, src, vlen_enc);
5442       break;
5443     default:
5444       ShouldNotReachHere();
5445   }
5446 }
5447 
5448 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5449                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5450   switch (from_elem_bt) {
5451     case T_BYTE:
5452       switch (to_elem_bt) {
5453         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5454         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5455         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5456         default: ShouldNotReachHere();
5457       }
5458       break;
5459     case T_SHORT:
5460       switch (to_elem_bt) {
5461         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5462         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5463         default: ShouldNotReachHere();
5464       }
5465       break;
5466     case T_INT:
5467       assert(to_elem_bt == T_LONG, "");
5468       vpmovsxdq(dst, src, vlen_enc);
5469       break;
5470     default:
5471       ShouldNotReachHere();
5472   }
5473 }
5474 
// Converts a vector mask (lane-wide 0/-1 values in an XMM/YMM register)
// from src_bt-sized lanes to dst_bt-sized lanes. Because mask lanes consist
// of homogeneous 0x00 or 0xFF bytes, widening can always sign-extend at
// byte granularity by the size ratio, and narrowing can pack with signed
// saturation (which maps 0 -> 0 and -1 -> -1). The 512-bit case is excluded.
void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
                                         BasicType dst_bt, BasicType src_bt, int vlen) {
  // Encode for the wider of the source and destination vectors.
  int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
  assert(vlen_enc != AVX_512bit, "");

  int dst_bt_size = type2aelembytes(dst_bt);
  int src_bt_size = type2aelembytes(src_bt);
  if (dst_bt_size > src_bt_size) {
    // Widening: byte-granular sign extension by the size ratio.
    switch (dst_bt_size / src_bt_size) {
      case 2: vpmovsxbw(dst, src, vlen_enc); break;
      case 4: vpmovsxbd(dst, src, vlen_enc); break;
      case 8: vpmovsxbq(dst, src, vlen_enc); break;
      default: ShouldNotReachHere();
    }
  } else {
    assert(dst_bt_size < src_bt_size, "");
    // Narrowing: pack with signed saturation. At 256 bits the pack
    // instructions operate per 128-bit lane, so a vpermq with selector 0x08
    // gathers the two packed low quadwords into the lower 128 bits.
    switch (src_bt_size / dst_bt_size) {
      case 2: {
        if (vlen_enc == AVX_128bit) {
          vpacksswb(dst, src, src, vlen_enc);
        } else {
          vpacksswb(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
        }
        break;
      }
      case 4: {
        // Two pack steps: dwords -> words -> bytes.
        if (vlen_enc == AVX_128bit) {
          vpackssdw(dst, src, src, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpackssdw(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      case 8: {
        // Select the low dword of each qword first, then pack down twice.
        if (vlen_enc == AVX_128bit) {
          vpshufd(dst, src, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpshufd(dst, src, 0x08, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, AVX_128bit);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      default: ShouldNotReachHere();
    }
  }
}
5529 
5530 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5531                                    bool merge, BasicType bt, int vlen_enc) {
5532   if (bt == T_INT) {
5533     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5534   } else {
5535     assert(bt == T_LONG, "");
5536     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5537   }
5538 }
5539 
5540 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5541                                    bool merge, BasicType bt, int vlen_enc) {
5542   if (bt == T_INT) {
5543     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5544   } else {
5545     assert(bt == T_LONG, "");
5546     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5547   }
5548 }
5549 
// Expands a bit-per-lane mask held in GPR src into a byte-per-lane vector
// mask in dst: mask bit i becomes byte i of dst with value 0 or 1. Each
// group of 8 mask bits is scattered into one quadword of 0x00/0x01 bytes
// using PDEP with the 0x0101010101010101 selector, staged 16 bytes at a
// time in xtmp. rtmp1, rtmp2 and xtmp are clobbered; any mask_len beyond
// the first 8 bits must be a multiple of 8.
void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
                                               Register rtmp2, XMMRegister xtmp, int mask_len,
                                               int vec_enc) {
  int index = 0;
  int vindex = 0;
  // Scatter the low 8 mask bits across the 8 bytes of rtmp1.
  mov64(rtmp1, 0x0101010101010101L);
  pdepq(rtmp1, src, rtmp1);
  if (mask_len > 8) {
    // Keep a shiftable copy of the mask and stage the first quadword in xtmp.
    movq(rtmp2, src);
    vpxor(xtmp, xtmp, xtmp, vec_enc);
    movq(xtmp, rtmp1);
  }
  movq(dst, rtmp1);

  mask_len -= 8;
  while (mask_len > 0) {
    assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
    index++;
    if ((index % 2) == 0) {
      // Starting a fresh 16-byte staging chunk.
      pxor(xtmp, xtmp);
    }
    // Expand the next 8 mask bits into a quadword of 0x00/0x01 bytes and
    // insert it into the staging register.
    mov64(rtmp1, 0x0101010101010101L);
    shrq(rtmp2, 8);
    pdepq(rtmp1, rtmp2, rtmp1);
    pinsrq(xtmp, rtmp1, index % 2);
    vindex = index / 2;
    if (vindex) {
      // Write entire 16 byte vector when both 64 bit
      // lanes are updated to save redundant instructions.
      if (index % 2) {
        vinsertf128(dst, dst, xtmp, vindex);
      }
    } else {
      vmovdqu(dst, xtmp);
    }
    mask_len -= 8;
  }
}
5588 
// Reduces a bit-per-lane mask (already materialized in GPR tmp) to a scalar
// result in dst. tmp is clobbered.
//  - VectorMaskTrueCount: number of set bits.
//  - VectorMaskLastTrue:  index of the highest set bit, or -1 when empty.
//  - VectorMaskFirstTrue: index of the lowest set bit, or masklen when empty.
//  - VectorMaskToLong:    the raw mask bits; caller must pass dst == tmp.
void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
  switch(opc) {
    case Op_VectorMaskTrueCount:
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      if (VM_Version::supports_lzcnt()) {
        // lzcnt yields 64 for a zero input, so 63 - 64 == -1 falls out for
        // an empty mask without a branch.
        lzcntq(tmp, tmp);
        movl(dst, 63);
        subl(dst, tmp);
      } else {
        // bsr sets ZF (and leaves an undefined result) on zero input, so
        // preload -1 and only move the found index over it when a bit was set.
        movl(dst, -1);
        bsrq(tmp, tmp);
        cmov32(Assembler::notZero, dst, tmp);
      }
      break;
    case Op_VectorMaskFirstTrue:
      if (VM_Version::supports_bmi1()) {
        if (masklen < 32) {
          // Set the bit just past the mask so tzcnt returns masklen for an
          // empty mask.
          orl(tmp, 1 << masklen);
          tzcntl(dst, tmp);
        } else if (masklen == 32) {
          // tzcnt returns 32 (== masklen) for a zero input.
          tzcntl(dst, tmp);
        } else {
          assert(masklen == 64, "");
          // tzcnt returns 64 (== masklen) for a zero input.
          tzcntq(dst, tmp);
        }
      } else {
        if (masklen < 32) {
          // As above: guard bit at position masklen handles the empty mask.
          orl(tmp, 1 << masklen);
          bsfl(dst, tmp);
        } else {
          assert(masklen == 32 || masklen == 64, "");
          // bsf sets ZF (undefined result) on zero input: preload masklen and
          // conditionally replace it with the found index.
          movl(dst, masklen);
          if (masklen == 32)  {
            bsfl(tmp, tmp);
          } else {
            bsfq(tmp, tmp);
          }
          cmov32(Assembler::notZero, dst, tmp);
        }
      }
      break;
    case Op_VectorMaskToLong:
      assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
      break;
    default: assert(false, "Unhandled mask operation");
  }
}
5638 
// Reduces a mask held in an AVX-512 opmask register to a scalar result in
// dst (see vector_mask_operation_helper for the supported operations).
// The mask bits are moved into GPR tmp, optionally clipped to masklen bits,
// then reduced. tmp is clobbered.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
                                              int masklen, int masksize, int vec_enc) {
  assert(VM_Version::supports_popcnt(), "");

  // Without AVX512BW only 16-bit opmask moves are available.
  if(VM_Version::supports_avx512bw()) {
    kmovql(tmp, mask);
  } else {
    assert(masklen <= 16, "");
    kmovwl(tmp, mask);
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  // FirstTrue is exempt: the helper ORs in a guard bit at position masklen,
  // so stale bits above masklen can never be reached by the scan.
  // NOTE(review): the guard tests masksize while the clip width uses
  // masklen -- presumably intentional (clip only sub-16-lane masks); confirm.
  if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5658 
// Reduces a vector mask held as lane-wide 0/-1 values (0/1 for T_BOOLEAN)
// in an XMM/YMM register to a scalar result in dst. The lane sign bits are
// gathered into GPR tmp with the movmsk flavor matching the lane size,
// clipped to masklen where the extraction can yield stale upper bits, then
// reduced via vector_mask_operation_helper. xtmp and tmp are clobbered.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              Register tmp, int masklen, BasicType bt, int vec_enc) {
  assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
         (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
  assert(VM_Version::supports_popcnt(), "");

  bool need_clip = false;
  switch(bt) {
    case T_BOOLEAN:
      // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1
      // Negate (0 - lane) to turn 0/1 into 0/-1 so the byte MSBs carry the mask.
      vpxor(xtmp, xtmp, xtmp, vec_enc);
      vpsubb(xtmp, xtmp, mask, vec_enc);
      vpmovmskb(tmp, xtmp, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_BYTE:
      vpmovmskb(tmp, mask, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_SHORT:
      // Pack words down to bytes; at 256 bits the pack interleaves per
      // 128-bit lane, so gather quadwords 0 and 2 (selector 8) before the
      // 128-bit byte movmsk.
      vpacksswb(xtmp, mask, mask, vec_enc);
      if (masklen >= 16) {
        vpermpd(xtmp, xtmp, 8, vec_enc);
      }
      vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
      need_clip = masklen < 16;
      break;
    case T_INT:
    case T_FLOAT:
      vmovmskps(tmp, mask, vec_enc);
      need_clip = masklen < 4;
      break;
    case T_LONG:
    case T_DOUBLE:
      vmovmskpd(tmp, mask, vec_enc);
      need_clip = masklen < 2;
      break;
    default: assert(false, "Unhandled type, %s", type2name(bt));
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  // FirstTrue is exempt: the helper ORs in a guard bit at position masklen.
  if (need_clip && opc != Op_VectorMaskFirstTrue) {
    // need_clip implies masklen < 32
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5708 
// Compresses the set bits of the mask in src into a contiguous run of low
// bits in dst: dst gets popcount(src & ((1 << mask_len) - 1)) low bits set.
// Implemented with BMI2 PEXT: extracting from an all-ones value at the
// positions selected by the mask yields a right-packed run of ones.
// rtmp1 and rtmp2 are clobbered.
void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
                                             Register rtmp2, int mask_len) {
  kmov(rtmp1, src);
  // Discard mask bits at and above mask_len.
  andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
  mov64(rtmp2, -1L);
  // rtmp2 = popcount(rtmp1) ones packed into the low bits.
  pextq(rtmp2, rtmp2, rtmp1);
  kmov(dst, rtmp2);
}
5717 
// AVX2 fallback for vector compress/expand (no AVX-512 VCOMPRESS/VEXPAND).
// The lane mask is converted into an index (via movmsk) that selects a
// 32-byte permutation row from a stub-generated table; vpermps applies the
// permutation and lanes whose table entry is -1 (unused positions) are
// zeroed by a variable blend against a zero vector. Element size must be
// 4 or 8 bytes; rtmp, rscratch, permv and xtmp are clobbered.
void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
                                                    XMMRegister mask, Register rtmp, Register rscratch,
                                                    XMMRegister permv, XMMRegister xtmp, BasicType bt,
                                                    int vec_enc) {
  assert(type2aelembytes(bt) >= 4, "");
  assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
  // Pick the per-element-size table and extract one mask bit per lane.
  address compress_perm_table = nullptr;
  address expand_perm_table = nullptr;
  if (type2aelembytes(bt) == 8) {
    compress_perm_table = StubRoutines::x86::compress_perm_table64();
    expand_perm_table  = StubRoutines::x86::expand_perm_table64();
    vmovmskpd(rtmp, mask, vec_enc);
  } else {
    compress_perm_table = StubRoutines::x86::compress_perm_table32();
    expand_perm_table = StubRoutines::x86::expand_perm_table32();
    vmovmskps(rtmp, mask, vec_enc);
  }
  shlq(rtmp, 5); // for 32 byte permute row.
  if (opcode == Op_CompressV) {
    lea(rscratch, ExternalAddress(compress_perm_table));
  } else {
    lea(rscratch, ExternalAddress(expand_perm_table));
  }
  // Load the permutation row selected by the mask value.
  addptr(rtmp, rscratch);
  vmovdqu(permv, Address(rtmp));
  // The permute is always performed at 256 bits (table rows are 8 dwords).
  vpermps(dst, permv, src, Assembler::AVX_256bit);
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with zero vector using permute mask, each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, this can potentially be used as a blending mask after
  // compressing/expanding the source vector lanes.
  vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
}
5751 
5752 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5753                                                bool merge, BasicType bt, int vec_enc) {
5754   if (opcode == Op_CompressV) {
5755     switch(bt) {
5756     case T_BYTE:
5757       evpcompressb(dst, mask, src, merge, vec_enc);
5758       break;
5759     case T_CHAR:
5760     case T_SHORT:
5761       evpcompressw(dst, mask, src, merge, vec_enc);
5762       break;
5763     case T_INT:
5764       evpcompressd(dst, mask, src, merge, vec_enc);
5765       break;
5766     case T_FLOAT:
5767       evcompressps(dst, mask, src, merge, vec_enc);
5768       break;
5769     case T_LONG:
5770       evpcompressq(dst, mask, src, merge, vec_enc);
5771       break;
5772     case T_DOUBLE:
5773       evcompresspd(dst, mask, src, merge, vec_enc);
5774       break;
5775     default:
5776       fatal("Unsupported type %s", type2name(bt));
5777       break;
5778     }
5779   } else {
5780     assert(opcode == Op_ExpandV, "");
5781     switch(bt) {
5782     case T_BYTE:
5783       evpexpandb(dst, mask, src, merge, vec_enc);
5784       break;
5785     case T_CHAR:
5786     case T_SHORT:
5787       evpexpandw(dst, mask, src, merge, vec_enc);
5788       break;
5789     case T_INT:
5790       evpexpandd(dst, mask, src, merge, vec_enc);
5791       break;
5792     case T_FLOAT:
5793       evexpandps(dst, mask, src, merge, vec_enc);
5794       break;
5795     case T_LONG:
5796       evpexpandq(dst, mask, src, merge, vec_enc);
5797       break;
5798     case T_DOUBLE:
5799       evexpandpd(dst, mask, src, merge, vec_enc);
5800       break;
5801     default:
5802       fatal("Unsupported type %s", type2name(bt));
5803       break;
5804     }
5805   }
5806 }
5807 
// Vector signum (EVEX): each lane of dst becomes -1.0 if the corresponding
// src lane is negative, +1.0 if positive, and the src value itself for
// +/-0.0 and NaN. zero and one hold broadcast 0.0 / 1.0 constants and are
// not modified; ktmp1 is clobbered.
void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                           KRegister ktmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    // dst = 0.0 - 1.0 = -1.0 in every lane.
    vsubpd(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    // (EQ_UQ is an unordered compare, so NaN lanes also match.)
    evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
  } else {
    assert(opcode == Op_SignumVF, "");
    // Same sequence in single precision.
    vsubps(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmps(dst, ktmp1, dst, src, true, vec_enc);
  }
}
5829 
// Vector signum (AVX, no opmask registers): -1.0 for negative lanes, +1.0
// for positive lanes, and the src value itself for +/-0.0 and NaN. The
// first blend selects on the sign bit of src directly; zero and one hold
// broadcast constants and are not modified; xtmp1 is clobbered.
void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                          XMMRegister xtmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    // dst = 0.0 - 1.0 = -1.0 in every lane.
    vsubpd(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    // (EQ_UQ is an unordered compare, so NaN lanes also match.)
    vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  } else {
    assert(opcode == Op_SignumVF, "");
    // Same sequence in single precision.
    vsubps(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  }
}
5849 
5850 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5851   if (VM_Version::supports_avx512bw()) {
5852     if (mask_len > 32) {
5853       kmovql(dst, src);
5854     } else {
5855       kmovdl(dst, src);
5856       if (mask_len != 32) {
5857         kshiftrdl(dst, dst, 32 - mask_len);
5858       }
5859     }
5860   } else {
5861     assert(mask_len <= 16, "");
5862     kmovwl(dst, src);
5863     if (mask_len != 16) {
5864       kshiftrwl(dst, dst, 16 - mask_len);
5865     }
5866   }
5867 }
5868 
5869 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5870   int lane_size = type2aelembytes(bt);
5871   if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5872       (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5873     movptr(rtmp, imm32);
5874     switch(lane_size) {
5875       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5876       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5877       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5878       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5879       fatal("Unsupported lane size %d", lane_size);
5880       break;
5881     }
5882   } else {
5883     movptr(rtmp, imm32);
5884     movq(dst, rtmp);
5885     switch(lane_size) {
5886       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5887       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5888       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5889       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5890       fatal("Unsupported lane size %d", lane_size);
5891       break;
5892     }
5893   }
5894 }
5895 
5896 //
5897 // Following is lookup table based popcount computation algorithm:-
5898 //       Index   Bit set count
5899 //     [ 0000 ->   0,
5900 //       0001 ->   1,
5901 //       0010 ->   1,
5902 //       0011 ->   2,
5903 //       0100 ->   1,
5904 //       0101 ->   2,
5905 //       0110 ->   2,
5906 //       0111 ->   3,
5907 //       1000 ->   1,
5908 //       1001 ->   2,
5909 //       1010 ->   3,
5910 //       1011 ->   3,
5911 //       1100 ->   2,
5912 //       1101 ->   3,
5913 //       1111 ->   4 ]
5914 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5915 //     shuffle indices for lookup table access.
5916 //  b. Right shift each byte of vector lane by 4 positions.
5917 //  c. Count the number of 1s in 4 MSB bits each byte. These bits are used as
5918 //     shuffle indices for lookup table access.
5919 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5920 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5921 //     count of all the bytes of a quadword.
5922 //  f. Perform step e. for upper 128bit vector lane.
5923 //  g. Pack the bitset count of quadwords back to double word.
5924 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
5925 
// Per-byte popcount via the nibble lookup table (steps a-d of the algorithm
// above): the 4-bit popcounts of the low and high nibbles are fetched with
// vpshufb and added. xtmp1, xtmp2 and rtmp are clobbered.
// NOTE(review): dst is written while src is still needed, so dst must not
// alias src -- confirm against callers.
void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
  // xtmp1 = 0x0F repeated: the nibble mask.
  vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
  // dst = high nibble of each byte, xtmp1 = low nibble of each byte.
  vpsrlw(dst, src, 4, vec_enc);
  vpand(dst, dst, xtmp1, vec_enc);
  vpand(xtmp1, src, xtmp1, vec_enc);
  // Look up the 4-bit popcounts and add the two nibble counts per byte.
  vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
  vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
  vpshufb(dst, xtmp2, dst, vec_enc);
  vpaddb(dst, dst, xtmp1, vec_enc);
}
5938 
// Per-int popcount: computes byte popcounts, then sums them within each
// dword (steps e-h of the algorithm above). dst also serves as scratch for
// the byte popcount; xtmp1, xtmp2 and rtmp are clobbered.
void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Following code is as per steps e,f,g and h of above algorithm.
  // vpsadbw sums byte counts per quadword, so process the high and low
  // dwords separately (zero-extended to quadwords via unpack against zero)
  // and pack the two quadword sums back into dwords.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
  vpsadbw(dst, dst, xtmp2, vec_enc);
  vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
  vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
  vpackuswb(dst, xtmp1, dst, vec_enc);
}
5950 
// Per-short popcount: computes byte popcounts, then adds the counts of the
// upper and lower byte of each word. dst also serves as scratch for the
// byte popcount; xtmp1, xtmp2 and rtmp are clobbered.
void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Add the popcount of upper and lower bytes of word.
  // dst = upper byte count (shifted down), xtmp1 = lower byte count (masked).
  vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
  vpsrlw(dst, xtmp1, 8, vec_enc);
  vpand(xtmp1, xtmp1, xtmp2, vec_enc);
  vpaddw(dst, dst, xtmp1, vec_enc);
}
5960 
// Per-long popcount: computes byte popcounts, then vpsadbw against a zero
// vector sums the eight byte counts of each quadword directly. dst also
// serves as scratch for the byte popcount; xtmp1, xtmp2 and rtmp are
// clobbered.
void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpsadbw(dst, xtmp1, xtmp2, vec_enc);
}
5967 
5968 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5969                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5970   switch(bt) {
5971     case T_LONG:
5972       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5973       break;
5974     case T_INT:
5975       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5976       break;
5977     case T_CHAR:
5978     case T_SHORT:
5979       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5980       break;
5981     case T_BYTE:
5982     case T_BOOLEAN:
5983       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5984       break;
5985     default:
5986       fatal("Unsupported type %s", type2name(bt));
5987       break;
5988   }
5989 }
5990 
5991 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5992                                                       KRegister mask, bool merge, int vec_enc) {
5993   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5994   switch(bt) {
5995     case T_LONG:
5996       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5997       evpopcntq(dst, mask, src, merge, vec_enc);
5998       break;
5999     case T_INT:
6000       assert(VM_Version::supports_avx512_vpopcntdq(), "");
6001       evpopcntd(dst, mask, src, merge, vec_enc);
6002       break;
6003     case T_CHAR:
6004     case T_SHORT:
6005       assert(VM_Version::supports_avx512_bitalg(), "");
6006       evpopcntw(dst, mask, src, merge, vec_enc);
6007       break;
6008     case T_BYTE:
6009     case T_BOOLEAN:
6010       assert(VM_Version::supports_avx512_bitalg(), "");
6011       evpopcntb(dst, mask, src, merge, vec_enc);
6012       break;
6013     default:
6014       fatal("Unsupported type %s", type2name(bt));
6015       break;
6016   }
6017 }
6018 
6019 // Bit reversal algorithm first reverses the bits of each byte followed by
6020 // a byte level reversal for multi-byte primitive types (short/int/long).
6021 // Algorithm performs a lookup table access to get reverse bit sequence
6022 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte
6023 // is obtained by swapping the reverse bit sequences of upper and lower
6024 // nibble of a byte.
// See the algorithm description above. Three paths are used depending on
// available ISA extensions: a LUT-based path for AVX512VLBW, a shift-based
// path for plain 512-bit EVEX, and a LUT-based AVX path otherwise.
// xtmp1, xtmp2 and rtmp are clobbered.
void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, Register rtmp, int vec_enc) {
  if (VM_Version::supports_avx512vlbw()) {

    // Get the reverse bit sequence of lower nibble of each byte.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
    evpandq(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    evporq(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);

  } else if(vec_enc == Assembler::AVX_512bit) {
    // Shift based bit reversal.
    assert(bt == T_LONG || bt == T_INT, "");

    // Swap lower and upper nibble of each byte.
    vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);

    // Swap two least and most significant bits of each nibble.
    vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);

    // Swap adjacent pair of bits.
    // (Copy dst into xtmp1 first because vector_swap_nbits reads its source
    // after writing its destination.)
    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);

    // Finish with a byte-order reversal of each element.
    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
  } else {
    // AVX path: same LUT algorithm as above with non-EVEX logical ops.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);

    // Get the reverse bit sequence of lower nibble of each byte.
    vpand(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    vpor(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);
  }
}
6082 
// Reverses the bits of each element using a single Galois-field affine
// transform: GF2P8AFFINEQB with the bit-reversal matrix (broadcast from
// 'mask') reverses the bits within every byte, and a byte-order reversal
// then completes the element-wide reversal. xtmp is clobbered.
void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
                                                XMMRegister xtmp, Register rscratch) {
  assert(VM_Version::supports_gfni(), "");
  assert(rscratch != noreg || always_reachable(mask), "missing");

  // Galois field instruction based bit reversal based on following algorithm.
  // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
  vpbroadcastq(xtmp, mask, vec_enc, rscratch);
  vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
  vector_reverse_byte(bt, dst, xtmp, vec_enc);
}
6094 
// Swaps adjacent nbits-wide bit groups within every quadword of src:
// dst = ((src & bitmask) << nbits) | ((src & ~bitmask) >> nbits).
// bitmask (broadcast as 32-bit elements) must select the low group of each
// adjacent pair, e.g. 0x0F0F0F0F for nibble swaps. xtmp1 and rtmp are
// clobbered; dst must not alias src (src is read after dst is written).
void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
                                          XMMRegister xtmp1, Register rtmp, int vec_enc) {
  vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
  // Low groups, shifted up into the high positions.
  evpandq(dst, xtmp1, src, vec_enc);
  vpsllq(dst, dst, nbits, vec_enc);
  // High groups, shifted down into the low positions.
  vpandn(xtmp1, xtmp1, src, vec_enc);
  vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
  // Recombine.
  evporq(dst, dst, xtmp1, vec_enc);
}
6104 
// Reverses the byte order of every element of src into dst using rotates
// rather than shuffles (EVEX path of the shift-based bit reversal above):
// progressively swap halves -- dwords within qwords, words within dwords --
// and finish by swapping adjacent bytes via vector_swap_nbits(8, ...).
// xtmp1, xtmp2 and rtmp are clobbered.
void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Shift based bit reversal.
  assert(VM_Version::supports_evex(), "");
  switch(bt) {
    case T_LONG:
      // Swap upper and lower double word of each quad word.
      evprorq(xtmp1, k0, src, 32, true, vec_enc);
      // Then swap the words within each double word.
      evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_INT:
      // Swap upper and lower word of each double word.
      evprord(xtmp1, k0, src, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      // Swap upper and lower byte of each word.
      vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
      break;
    case T_BYTE:
      // Single-byte elements: plain copy.
      evmovdquq(dst, k0, src, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6134 
// Reverses the byte order of every element of src into dst. T_BYTE is a
// plain copy; wider element types are shuffled with a per-type precomputed
// permutation mask loaded from the stub area. dst is used to stage the
// shuffle mask before the final vpshufb.
void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
  if (bt == T_BYTE) {
    // Single-byte elements: just copy, using the EVEX move where available.
    if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
      evmovdquq(dst, k0, src, true, vec_enc);
    } else {
      vmovdqu(dst, src);
    }
    return;
  }
  // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
  // pre-computed shuffle indices.
  switch(bt) {
    case T_LONG:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
      break;
    case T_INT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
      break;
    case T_CHAR:
    case T_SHORT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
  vpshufb(dst, src, dst, vec_enc);
}
6163 
// Per-element leading zero count using AVX512CD VPLZCNT instructions.
// VPLZCNT exists only for dword/qword lanes, so T_SHORT is widened to dwords
// before counting, and T_BYTE uses a nibble lookup table instead.
// xtmp1-xtmp3 are vector scratch, ktmp is a mask scratch, rtmp is a GPR
// scratch used when loading constants; merge selects EVEX merge-masking.
void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                        KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
  assert(is_integral_type(bt), "");
  assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
  assert(VM_Version::supports_avx512cd(), "");
  switch(bt) {
    case T_LONG:
      evplzcntq(dst, ktmp, src, merge, vec_enc);
      break;
    case T_INT:
      evplzcntd(dst, ktmp, src, merge, vec_enc);
      break;
    case T_SHORT:
      // xtmp1 = -1 (all bits set).
      vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
      // Widen each low short to a dword with the short in the upper half and
      // all-ones below, so the dword lzcnt equals the short lzcnt (<= 16).
      vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
      evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
      // Same for the high shorts of each 128-bit lane.
      vpunpckhwd(dst, xtmp1, src, vec_enc);
      evplzcntd(dst, ktmp, dst, merge, vec_enc);
      // Pack the dword counts (all <= 16, so unsigned saturation is a no-op)
      // back into shorts, restoring the original element order per lane.
      vpackusdw(dst, xtmp2, dst, vec_enc);
      break;
    case T_BYTE:
      // T1 = Compute leading zero counts of 4 LSB bits of each byte by
      // accessing the lookup table.
      // T2 = Compute leading zero counts of 4 MSB bits of each byte by
      // accessing the lookup table.
      // Add T1 to T2 if 4 MSB bits of byte are all zeros.
      assert(VM_Version::supports_avx512bw(), "");
      evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
      // dst = 0x0F nibble mask in every byte.
      vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
      vpand(xtmp2, dst, src, vec_enc);
      vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
      vpsrlw(xtmp3, src, 4, vec_enc);
      vpand(xtmp3, dst, xtmp3, vec_enc);
      vpshufb(dst, xtmp1, xtmp3, vec_enc);
      vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
      // ktmp marks bytes whose high nibble is zero; only those add the
      // low-nibble count on top of the high-nibble count.
      evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
      evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6208 
// Per-byte leading zero count for AVX/AVX2 targets using a 16-entry nibble
// lookup table. On return xtmp1 is zero (callers such as
// vector_count_leading_zeros_short_avx rely on this side effect).
void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
  // xtmp2 = 0x0F nibble mask in every byte.
  vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
  // T1 = Compute leading zero counts of 4 LSB bits of each byte by
  // accessing the lookup table.
  vpand(dst, xtmp2, src, vec_enc);
  vpshufb(dst, xtmp1, dst, vec_enc);
  // T2 = Compute leading zero counts of 4 MSB bits of each byte by
  // accessing the lookup table.
  vpsrlw(xtmp3, src, 4, vec_enc);
  vpand(xtmp3, xtmp2, xtmp3, vec_enc);
  vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
  // Add T1 to T2 if 4 MSB bits of byte are all zeros.
  vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
  // xtmp3 = per-byte mask, set where the high nibble is zero.
  vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
  vpaddb(dst, dst, xtmp2, vec_enc);
  // Select T1+T2 where the high nibble was zero, otherwise just T2.
  vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
}
6228 
// Per-short leading zero count for AVX/AVX2 targets, built on the per-byte
// count: for a word [hi,lo] the result is clz(hi) when hi != 0, else
// 8 + clz(lo).
void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add zero counts of lower byte and upper byte of a word if
  // upper byte holds a zero value.
  vpsrlw(xtmp3, src, 8, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  // xtmp3 = per-word mask, set where the upper byte of the word is zero.
  vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
  // High byte of xtmp2 word = clz(lo) + clz(hi); with hi == 0 that is clz(lo) + 8.
  vpsllw(xtmp2, dst, 8, vec_enc);
  vpaddw(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  // The word result now sits in the high byte; shift it down.
  vpsrlw(dst, dst, 8, vec_enc);
}
6242 
// Per-int leading zero count for AVX/AVX2 targets (no VPLZCNT available).
void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                           XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // By converting the integer to a float, we can obtain the number of leading zeros based on the exponent of the float.
  // As the float exponent contains a bias of 127 for nonzero values, the bias must be removed before interpreting the
  // exponent as the leading zero count.

  // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
  // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
  // contributes to the leading number of zeros.
  vpsrld(dst, src, 1, vec_enc);
  vpandn(dst, dst, src, vec_enc);

  vcvtdq2ps(dst, dst, vec_enc);

  // By comparing the register to itself, all the bits in the destination are set.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);

  // Move the biased exponent to the low end of the lane and mask with 0xFF to discard the sign bit.
  // xtmp2 = 0xFF per lane (all-ones >> 24).
  vpsrld(xtmp2, xtmp1, 24, vec_enc);
  vpsrld(dst, dst, 23, vec_enc);
  vpand(dst, xtmp2, dst, vec_enc);

  // Subtract 127 from the exponent, which removes the bias from the exponent.
  // xtmp2 = 127 per lane (all-ones >> 25).
  vpsrld(xtmp2, xtmp1, 25, vec_enc);
  vpsubd(dst, dst, xtmp2, vec_enc);

  // xtmp2 = 31 per lane (all-ones >> 27).
  vpsrld(xtmp2, xtmp1, 27, vec_enc);

  // If the original value is 0 the exponent would not have bias, so the subtraction creates a negative number. If this
  // is found in any of the lanes, replace the lane with -1 from xtmp1.
  vblendvps(dst, dst, xtmp1, dst, vec_enc, true, xtmp3);

  // If the original value is negative, replace the lane with 31.
  vblendvps(dst, dst, xtmp2, src, vec_enc, true, xtmp3);

  // Subtract the exponent from 31, giving the final result. For 0, the result is 32 as the exponent was replaced with -1,
  // and for negative numbers the result is 0 as the exponent was replaced with 31.
  vpsubd(dst, xtmp2, dst, vec_enc);
}
6282 
// Per-long leading zero count for AVX/AVX2 targets, built from the per-int
// count applied to both 32-bit halves of each long.
void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  // Find the leading zeros of the top and bottom halves of the long individually.
  vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);

  // Move the top half result to the bottom half of xtmp1, setting the top half to 0.
  vpsrlq(xtmp1, dst, 32, vec_enc);
  // By moving the top half result to the right by 6 bits, if the top half was empty (i.e. 32 is returned) the result bit will
  // be in the most significant position of the bottom half.
  vpsrlq(xtmp2, dst, 6, vec_enc);

  // In the bottom half, add the top half and bottom half results.
  vpaddq(dst, xtmp1, dst, vec_enc);

  // For the bottom half, choose between the values using the most significant bit of xtmp2.
  // If the MSB is set, then bottom+top in dst is the resulting value. If the top half is less than 32 xtmp1 is chosen,
  // which contains only the top half result.
  // In the top half the MSB is always zero, so the value in xtmp1 is always chosen. This value is always 0, which clears
  // the lane as required.
  vblendvps(dst, xtmp1, dst, xtmp2, vec_enc, true, xtmp3);
}
6304 
6305 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6306                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6307                                                        Register rtmp, int vec_enc) {
6308   assert(is_integral_type(bt), "unexpected type");
6309   assert(vec_enc < Assembler::AVX_512bit, "");
6310   switch(bt) {
6311     case T_LONG:
6312       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6313       break;
6314     case T_INT:
6315       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6316       break;
6317     case T_SHORT:
6318       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6319       break;
6320     case T_BYTE:
6321       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6322       break;
6323     default:
6324       fatal("Unsupported type %s", type2name(bt));
6325       break;
6326   }
6327 }
6328 
6329 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6330   switch(bt) {
6331     case T_BYTE:
6332       vpsubb(dst, src1, src2, vec_enc);
6333       break;
6334     case T_SHORT:
6335       vpsubw(dst, src1, src2, vec_enc);
6336       break;
6337     case T_INT:
6338       vpsubd(dst, src1, src2, vec_enc);
6339       break;
6340     case T_LONG:
6341       vpsubq(dst, src1, src2, vec_enc);
6342       break;
6343     default:
6344       fatal("Unsupported type %s", type2name(bt));
6345       break;
6346   }
6347 }
6348 
// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
// Per-element trailing zero count for AVX-512 targets, derived from the
// leading zero count of (x - 1) & ~x, which isolates the trailing zero run.
// xtmp1-xtmp4, ktmp and rtmp are scratch registers.
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp = xtmp + src  (i.e. src - 1)
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp = xtmp & ~src  (ternary function 0x40 = A & B & ~C; bitwise, so the
  // dword form works for every element type)
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  // dst = element_width_in_bits - CLZ
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}
6367 
// Trailing zero count computation for AVX2 targets is based on the popcount operation as per
// the following equation
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
// Per-element trailing zero count for AVX2 targets: x | -x sets every bit at
// and above the lowest set bit, so its popcount is width - CTZ.
// xtmp1-xtmp3 and rtmp are scratch registers.
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = 0
  vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
  // xtmp = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp = xtmp | src
  vpor(xtmp3, xtmp3, src, vec_enc);
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  // dst = element_width_in_bits - POPC
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}
6383 
// Unsigned 32-bit division: rax = (juint)rax / (juint)divisor.
// The dividend must be in rax; rdx is clobbered (required by divl).
void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Divisor fits in 31 bits: hardware unsigned divide with zeroed rdx.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  // A divisor with its sign bit set is >= 2^31 unsigned, so the quotient is
  // 0 or 1 and can be computed branch-free.
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  shrl(rax, 31);
  bind(done);
}
6407 
// Unsigned 32-bit remainder: rdx = (juint)rax % (juint)divisor.
// The dividend must be in rax; rax is clobbered.
void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Divisor fits in 31 bits: hardware unsigned divide leaves remainder in rdx.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  // rax = quotient (0 or 1) sign-extended to an all-zeros/all-ones mask.
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}
6433 
// Combined unsigned 32-bit divide and remainder:
// rax = (juint)rax / (juint)divisor, rdx = (juint)rax % (juint)divisor.
// The dividend must be in rax; tmp is a scratch register.
void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Divisor fits in 31 bits: hardware divide yields quotient in rax,
  // remainder in rdx.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  // tmp = all-ones mask when quotient is 1, else zero.
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}
6464 
// Reverse the bit order of a 32-bit value (Java Integer.reverse semantics).
// On GFNI-capable hardware a single GF(2) affine transform reverses the bits
// within each byte; otherwise bit pairs, nibble halves and byte halves are
// swapped in turn. Both paths finish with bswapl to reverse the byte order.
// xtmp1/xtmp2 are XMM scratch (GFNI path only), rtmp is a GPR scratch.
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    // The identity-reversal matrix for GF2P8AFFINEQB.
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  // Bits within each byte are reversed; reverse the byte order to finish.
  bswapl(dst);
}
6503 
// Reverse the bit order of a 64-bit value (Java Long.reverse semantics).
// Same strategy as reverseI, widened to 64 bits: GFNI affine transform when
// available, otherwise successive swap of bit pairs, nibble halves and byte
// halves, followed by bswapq to reverse the byte order.
// xtmp1/xtmp2 are XMM scratch (GFNI path only); rtmp1/rtmp2 are GPR scratch.
void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    // The identity-reversal matrix for GF2P8AFFINEQB.
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    // Complement the mask instead of materializing 0xAAAA... separately.
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  // Bits within each byte are reversed; reverse the byte order to finish.
  bswapq(dst);
}
6548 
// Unsigned 64-bit division: rax = (julong)rax / (julong)divisor.
// The dividend must be in rax; rdx is clobbered (required by divq).
void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Divisor fits in 63 bits: hardware unsigned divide with zeroed rdx.
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  // A divisor with its sign bit set is >= 2^63 unsigned, so the quotient is
  // 0 or 1 and can be computed branch-free.
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  shrq(rax, 63);
  bind(done);
}
6572 
// Unsigned 64-bit remainder: rdx = (julong)rax % (julong)divisor.
// The dividend must be in rax; rax is clobbered.
void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Divisor fits in 63 bits: hardware unsigned divide leaves remainder in rdx.
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  // rax = quotient (0 or 1) sign-extended to an all-zeros/all-ones mask.
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}
6598 
// Combined unsigned 64-bit divide and remainder:
// rax = (julong)rax / (julong)divisor, rdx = (julong)rax % (julong)divisor.
// The dividend must be in rax; tmp is a scratch register.
void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Divisor fits in 63 bits: hardware divide yields quotient in rax,
  // remainder in rdx.
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  // tmp = all-ones mask when quotient is 1, else zero.
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
6628 
// Cross-lane byte rearrangement for AVX-512 vectors. VPSHUFB only shuffles
// within 128-bit lanes, so each of the four source lanes is broadcast across
// the vector in turn and the bytes whose shuffle index falls into that lane's
// range [16*i, 16*i+16) are merged into dst under a mask.
// xtmp1-xtmp3, rtmp and ktmp are scratch registers.
void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are inlane operations and indices are determined using
  // lower 4 bit of each shuffle lane, thus all shuffle indices are
  // normalized to index range 0-15. This makes sure that all the multiples
  // of an index value are placed at same relative position in 128 bit
  // lane i.e. elements corresponding to shuffle indices 16, 32 and 64
  // will be 16th element in their respective 128 bit lanes.
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
  // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
  // original shuffle indices and move the shuffled lanes corresponding to true
  // mask to destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
  // and broadcasting second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
  // xtmp2 = 32 in each byte.
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
  // and broadcasting third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
  // xtmp1 = 48 in each byte.
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
  // xtmp2 = 64 in each byte.
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}
6674 
6675 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6676                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6677   if (vlen_enc == AVX_128bit) {
6678     vpermilps(dst, src, shuffle, vlen_enc);
6679   } else if (bt == T_INT) {
6680     vpermd(dst, shuffle, src, vlen_enc);
6681   } else {
6682     assert(bt == T_FLOAT, "");
6683     vpermps(dst, shuffle, src, vlen_enc);
6684   }
6685 }
6686 
6687 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6688   switch(opcode) {
6689     case Op_AddHF: vaddsh(dst, src1, src2); break;
6690     case Op_SubHF: vsubsh(dst, src1, src2); break;
6691     case Op_MulHF: vmulsh(dst, src1, src2); break;
6692     case Op_DivHF: vdivsh(dst, src1, src2); break;
6693     default: assert(false, "%s", NodeClassNames[opcode]); break;
6694   }
6695 }
6696 
6697 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6698   switch(elem_bt) {
6699     case T_BYTE:
6700       if (ideal_opc == Op_SaturatingAddV) {
6701         vpaddsb(dst, src1, src2, vlen_enc);
6702       } else {
6703         assert(ideal_opc == Op_SaturatingSubV, "");
6704         vpsubsb(dst, src1, src2, vlen_enc);
6705       }
6706       break;
6707     case T_SHORT:
6708       if (ideal_opc == Op_SaturatingAddV) {
6709         vpaddsw(dst, src1, src2, vlen_enc);
6710       } else {
6711         assert(ideal_opc == Op_SaturatingSubV, "");
6712         vpsubsw(dst, src1, src2, vlen_enc);
6713       }
6714       break;
6715     default:
6716       fatal("Unsupported type %s", type2name(elem_bt));
6717       break;
6718   }
6719 }
6720 
6721 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6722   switch(elem_bt) {
6723     case T_BYTE:
6724       if (ideal_opc == Op_SaturatingAddV) {
6725         vpaddusb(dst, src1, src2, vlen_enc);
6726       } else {
6727         assert(ideal_opc == Op_SaturatingSubV, "");
6728         vpsubusb(dst, src1, src2, vlen_enc);
6729       }
6730       break;
6731     case T_SHORT:
6732       if (ideal_opc == Op_SaturatingAddV) {
6733         vpaddusw(dst, src1, src2, vlen_enc);
6734       } else {
6735         assert(ideal_opc == Op_SaturatingSubV, "");
6736         vpsubusw(dst, src1, src2, vlen_enc);
6737       }
6738       break;
6739     default:
6740       fatal("Unsupported type %s", type2name(elem_bt));
6741       break;
6742   }
6743 }
6744 
// Unsigned saturating subtraction for int/long elements on AVX-512:
// dst = (src1 <u src2) ? 0 : src1 - src2.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                              XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when magnitude of second input is greater than first input.
  // The mask computed here is src2 <u src1; lanes with src1 == src2 are
  // excluded, but their true difference is 0, which zero-masking produces anyway.
  evpcmpu(elem_bt, ktmp,  src2, src1, Assembler::lt, vlen_enc);
  // Res = mask ? INP1 - INP2 : Zero (zero-masked subtract; subtraction is
  // non-commutative and non-associative, so operand order matters)
  evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
}
6753 
// Unsigned saturating subtraction for int/long elements on AVX/AVX2:
// dst = (src1 <u src2) ? 0 : src1 - src2. xtmp1/xtmp2 are scratch.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Emulate unsigned comparison using signed comparison
  // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
  // xtmp1 = MIN_VALUE broadcast (biases both operands into signed range).
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
  vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
  vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);

  // xtmp2 = signed-greater mask: biased(src2) > biased(src1), i.e. src1 <u src2.
  vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);

  // Res = INP1 - INP2 (non-commutative and non-associative)
  vpsub(elem_bt, dst, src1, src2, vlen_enc);
  // Res = Mask ? Zero : Res
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
}
6770 
// Unsigned saturating addition for int/long elements on AVX-512:
// dst = overflow ? unsigned-max : src1 + src2. xtmp1/xtmp2/ktmp are scratch.
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                               XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned values ranges comprise of only +ve numbers, thus there exist only an upper bound saturation.
  // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
  // (see the Hacker's Delight derivation documented above
  // vector_add_dq_saturating_unsigned_avx)
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // T1 = SRC1 | SRC2
  vpor(xtmp1, src1, src2, vlen_enc);
  // Max_Unsigned = -1
  vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
  // Unsigned compare:  Mask = Res <u T1
  evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
  // res  = Mask ? Max_Unsigned : Res
  evpblend(elem_bt, dst, ktmp,  dst, xtmp2, true, vlen_enc);
}
6786 
6787 //
6788 // Section 2-13 Hacker's Delight list following overflow detection check for saturating
6789 // unsigned addition operation.
6790 //    overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
6791 //
6792 // We empirically determined its semantic equivalence to following reduced expression
6793 //    overflow_mask =  (a + b) <u (a | b)
6794 //
6795 // and also verified it though Alive2 solver.
6796 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6797 //
6798 
// Unsigned saturating addition for int/long elements on AVX/AVX2:
// dst = overflow ? unsigned-max (-1) : src1 + src2. Overflow is detected via
// the reduced expression documented above; the unsigned compare is emulated
// by biasing both sides with MIN_VALUE and comparing signed.
// xtmp1-xtmp3 are scratch.
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // Compute T1 = INP1 | INP2
  vpor(xtmp3, src1, src2, vlen_enc);
  // xtmp2 = MIN_VALUE broadcast; as a side effect xtmp1 = -1 (all ones),
  // which doubles as the unsigned saturated result used in the final blend.
  vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  // Convert T1 to signed value, T1 = T1 + MIN_VALUE
  vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
  // Convert Res to signed value, Res<s> = Res + MIN_VALUE
  vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<s> <s T1
  if (elem_bt == T_INT) {
    vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
  } else {
    assert(elem_bt == T_LONG, "");
    vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
  }
  // dst = overflow ? -1 (unsigned max) : Res
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
6820 
// Set ktmp from the sign bit of each quadword lane of src (VPMOVQ2M
// semantics). On targets without AVX512DQ this is emulated: an arithmetic
// shift by 63 smears each lane's sign bit, and a compare against -1 turns
// the all-ones lanes into mask bits. xtmp1/xtmp2 are scratch; when
// xtmp2_hold_M1 is true the caller guarantees xtmp2 already holds -1.
void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovq2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      // xtmp2 = -1 (all ones).
      vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    evpsraq(xtmp1, src, 63, vlen_enc);
    evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}
6834 
// Set ktmp from the sign bit of each doubleword lane of src (VPMOVD2M
// semantics). Emulated on targets without AVX512DQ just like
// evpmovq2m_emu, using a 31-bit arithmetic shift plus compare against -1.
void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovd2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      // xtmp2 = -1 (all ones).
      vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    vpsrad(xtmp1, src, 31, vlen_enc);
    Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}
6848 
6849 
// Fill each int/long lane of dst with the sign of the corresponding src lane
// (all-ones for negative, zero otherwise).
void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
  if (elem_bt == T_LONG) {
    if (VM_Version::supports_evex()) {
      evpsraq(dst, src, 63, vlen_enc);
    } else {
      // No 64-bit arithmetic shift pre-EVEX: smear each dword's sign, then
      // replicate the upper dword of every quadword (shuffle 0xF5 selects
      // dwords {1,1,3,3}) so the whole lane carries the long's sign.
      vpsrad(dst, src, 31, vlen_enc);
      vpshufd(dst, dst, 0xF5, vlen_enc);
    }
  } else {
    assert(elem_bt == T_INT, "");
    vpsrad(dst, src, 31, vlen_enc);
  }
}
6863 
// Broadcast the maximum signed value of elem_bt (0x7FFF...) into every lane
// of dst by logically shifting an all-ones vector right by one. When
// compute_allones is false the caller guarantees 'allones' already holds -1;
// otherwise it is materialized here (and left in 'allones' for reuse).
void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsrlq(dst, allones, 1, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpsrld(dst, allones, 1, vlen_enc);
  }
}
6879 
// Broadcast the minimum signed value of elem_bt (0x8000...) into every lane
// of dst by shifting an all-ones vector left to leave only the sign bit.
// When compute_allones is false the caller guarantees 'allones' already
// holds -1; otherwise it is materialized here (and left in 'allones').
void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsllq(dst, allones, 63, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpslld(dst, allones, 31, vlen_enc);
  }
}
6895 
6896 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
6897                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6898   switch(elem_bt) {
6899     case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6900     case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6901     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6902     case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6903     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6904   }
6905 }
6906 
6907 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6908   switch(elem_bt) {
6909     case  T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
6910     case  T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
6911     case  T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6912     case  T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
6913     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6914   }
6915 }
6916 
6917 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6918                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6919   if (elem_bt == T_LONG) {
6920     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6921   } else {
6922     assert(elem_bt == T_INT, "");
6923     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6924   }
6925 }
6926 
// Saturating signed add/sub for int/long lanes on EVEX targets: compute the
// wrapped result, detect per-lane overflow, then replace overflowed lanes
// with MIN_VALUE/MAX_VALUE of the element type.
// xtmp1/xtmp2 are vector scratch; ktmp1/ktmp2 are opmask scratch.
void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                         KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection based on Hacker's Delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Compute overflow detection mask: sign bits of xtmp2 -> ktmp1.
  evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 hold -1 in all its lanes after above call.

  // Compute mask based on first input polarity: sign bits of src1 -> ktmp2.
  // The trailing 'true' says the -1 constant already sits in xtmp1.
  evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);

  // xtmp2 = per-lane MAX_VALUE (the 'true' re-materializes -1 in xtmp1 for
  // the AVX512DQ path, where the earlier emulation did not run).
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  // xtmp1 = per-lane MIN_VALUE, derived from the -1 left in xtmp1.
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in first input polarity mask holds a min value.
  evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
  // Blend destination lanes with saturated values using overflow detection mask.
  evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
}
6969 
6970 
// Saturating signed add/sub for int/long lanes on AVX (pre-EVEX) targets.
// Same algorithm as the EVEX flavor, but overflow and polarity masks are
// kept as sign-extended vectors and applied with VPBLENDVB instead of
// opmask registers. xtmp1..xtmp4 are vector scratch registers.
void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                        XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                        XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection based on Hacker's Delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Sign-extend to compute overflow detection mask: xtmp3 lanes become
  // all-ones where overflow occurred, all-zeros elsewhere.
  vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);

  // Materialize -1 in xtmp1, then derive the saturating constants:
  // xtmp2 = per-lane MAX_VALUE, xtmp1 = per-lane MIN_VALUE.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose saturating min/max vector using first input polarity mask:
  // negative src1 lanes select MIN_VALUE, the rest MAX_VALUE.
  vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
  vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);

  // Blend result with saturating vector using overflow detection mask.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
7011 
7012 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7013   switch(elem_bt) {
7014     case T_BYTE:
7015       if (ideal_opc == Op_SaturatingAddV) {
7016         vpaddsb(dst, src1, src2, vlen_enc);
7017       } else {
7018         assert(ideal_opc == Op_SaturatingSubV, "");
7019         vpsubsb(dst, src1, src2, vlen_enc);
7020       }
7021       break;
7022     case T_SHORT:
7023       if (ideal_opc == Op_SaturatingAddV) {
7024         vpaddsw(dst, src1, src2, vlen_enc);
7025       } else {
7026         assert(ideal_opc == Op_SaturatingSubV, "");
7027         vpsubsw(dst, src1, src2, vlen_enc);
7028       }
7029       break;
7030     default:
7031       fatal("Unsupported type %s", type2name(elem_bt));
7032       break;
7033   }
7034 }
7035 
7036 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7037   switch(elem_bt) {
7038     case T_BYTE:
7039       if (ideal_opc == Op_SaturatingAddV) {
7040         vpaddusb(dst, src1, src2, vlen_enc);
7041       } else {
7042         assert(ideal_opc == Op_SaturatingSubV, "");
7043         vpsubusb(dst, src1, src2, vlen_enc);
7044       }
7045       break;
7046     case T_SHORT:
7047       if (ideal_opc == Op_SaturatingAddV) {
7048         vpaddusw(dst, src1, src2, vlen_enc);
7049       } else {
7050         assert(ideal_opc == Op_SaturatingSubV, "");
7051         vpsubusw(dst, src1, src2, vlen_enc);
7052       }
7053       break;
7054     default:
7055       fatal("Unsupported type %s", type2name(elem_bt));
7056       break;
7057   }
7058 }
7059 
7060 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
7061                                                      XMMRegister src2, int vlen_enc) {
7062   switch(elem_bt) {
7063     case T_BYTE:
7064       evpermi2b(dst, src1, src2, vlen_enc);
7065       break;
7066     case T_SHORT:
7067       evpermi2w(dst, src1, src2, vlen_enc);
7068       break;
7069     case T_INT:
7070       evpermi2d(dst, src1, src2, vlen_enc);
7071       break;
7072     case T_LONG:
7073       evpermi2q(dst, src1, src2, vlen_enc);
7074       break;
7075     case T_FLOAT:
7076       evpermi2ps(dst, src1, src2, vlen_enc);
7077       break;
7078     case T_DOUBLE:
7079       evpermi2pd(dst, src1, src2, vlen_enc);
7080       break;
7081     default:
7082       fatal("Unsupported type %s", type2name(elem_bt));
7083       break;
7084   }
7085 }
7086 
7087 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
7088   if (is_unsigned) {
7089     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7090   } else {
7091     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7092   }
7093 }
7094 
7095 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
7096   if (is_unsigned) {
7097     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7098   } else {
7099     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7100   }
7101 }
7102 
7103 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
7104   switch(opcode) {
7105     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7106     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7107     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7108     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7109     default: assert(false, "%s", NodeClassNames[opcode]); break;
7110   }
7111 }
7112 
7113 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7114   switch(opcode) {
7115     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7116     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7117     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7118     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7119     default: assert(false, "%s", NodeClassNames[opcode]); break;
7120   }
7121 }
7122 
// Scalar half-float min/max: delegates to the vector implementation at
// 128-bit width.
void C2_MacroAssembler::sminmax_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                     KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
  vminmax_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
}
7127 
7128 void C2_MacroAssembler::sminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7129                                              KRegister ktmp) {
7130   if (opcode == Op_MaxHF) {
7131     // dst = max(src1, src2)
7132     evminmaxsh(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN);
7133   } else {
7134     assert(opcode == Op_MinHF, "");
7135     // dst = min(src1, src2)
7136     evminmaxsh(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN);
7137   }
7138 }
7139 
// Vector half-float min/max built on VMINPH/VMAXPH, whose tie/NaN rule is
// "second source operand wins". The inputs are pre-swapped per lane based on
// one operand's sign bit so that rule yields the desired result for signed
// zeros, and a final unordered-compare fixup propagates NaNs from the first
// operand. ktmp, xtmp1, xtmp2 are scratch.
void C2_MacroAssembler::vminmax_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                     KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
    // Move sign bits of src2 to mask register.
    evpmovw2m(ktmp, src2, vlen_enc);
    // xtmp1 = src2 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make the second source operand a +ve value.
    // As per instruction semantic, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned
    // dst = max(xtmp1, xtmp2)
    evmaxph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if it is a NaN value;
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  } else {
    assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
    // Move sign bits of src1 to mask register.
    evpmovw2m(ktmp, src1, vlen_enc);
    // xtmp1 = src1 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src1 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make the second source operand a -ve value.
    // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned.
    // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
    // or a valid floating-point value, is written to the result.
    // dst = min(xtmp1, xtmp2)
    evminph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if it is a NaN value;
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  }
}
7184 
7185 void C2_MacroAssembler::vminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7186                                              KRegister ktmp, int vlen_enc) {
7187   if (opcode == Op_MaxVHF) {
7188     // dst = max(src1, src2)
7189     evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vlen_enc);
7190   } else {
7191     assert(opcode == Op_MinVHF, "");
7192     // dst = min(src1, src2)
7193     evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vlen_enc);
7194   }
7195 }
7196 
7197 void C2_MacroAssembler::vminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, Address src2,
7198                                              KRegister ktmp, int vlen_enc) {
7199   if (opcode == Op_MaxVHF) {
7200     // dst = max(src1, src2)
7201     evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vlen_enc);
7202   } else {
7203     assert(opcode == Op_MinVHF, "");
7204     // dst = min(src1, src2)
7205     evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vlen_enc);
7206   }
7207 }