1 /*
   2  * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "gc/shared/barrierSet.hpp"
  28 #include "gc/shared/barrierSetAssembler.hpp"
  29 #include "oops/methodData.hpp"
  30 #include "opto/c2_MacroAssembler.hpp"
  31 #include "opto/intrinsicnode.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/opcodes.hpp"
  34 #include "opto/subnode.hpp"
  35 #include "runtime/globals.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/objectMonitorTable.hpp"
  38 #include "runtime/stubRoutines.hpp"
  39 #include "runtime/synchronizer.hpp"
  40 #include "utilities/checkedCast.hpp"
  41 #include "utilities/globalDefinitions.hpp"
  42 #include "utilities/powerOfTwo.hpp"
  43 #include "utilities/sizes.hpp"
  44 
  45 #ifdef PRODUCT
  46 #define BLOCK_COMMENT(str) /* nothing */
  47 #define STOP(error) stop(error)
  48 #else
  49 #define BLOCK_COMMENT(str) block_comment(str)
  50 #define STOP(error) block_comment(error); stop(error)
  51 #endif
  52 
  53 // C2 compiled method's prolog code.
  54 // Beware! This sp_inc is NOT the same as the one mentioned in MacroAssembler::remove_frame but only the size
  55 // of the extension space + the additional copy of the return address. That means, it doesn't contain the
  56 // frame size (where the local and sp_inc are) and the saved RBP.
void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  if (C->clinit_barrier_on_entry()) {
    // Fast class-initialization check: if the holder class is not yet
    // initialized for the current thread, bail out to the wrong-method stub
    // which re-resolves and blocks until initialization completes.
    assert(VM_Version::supports_fast_class_init_checks(), "sanity");
    assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");

    Label L_skip_barrier;
    Register klass = rscratch1;

    mov_metadata(klass, C->method()->holder()->constant_encoding());
    clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);

    jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    bind(L_skip_barrier);
  }

  int framesize = C->output()->frame_size_in_bytes();
  int bangsize = C->output()->bang_size_in_bytes();
  bool fp_mode_24b = false;
  int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;

  // If we bang at all, the bang must cover at least the whole frame.
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
#ifdef ASSERT
    if (sp_inc > 0) {
      // Poison the two stack slots just below the saved rbp so stale data in
      // the extension space is caught early in debug builds.
      // NOTE(review): assumes these slots belong to the extension area when
      // sp_inc > 0 — confirm against the stack-repair layout.
      movl(Address(rsp, 0), badRegWordVal);
      movl(Address(rsp, VMRegImpl::stack_slot_size), badRegWordVal);
    }
#endif
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // No stack bang needed: allocate the whole frame first, then store rbp
    // into its slot at the top of the new frame.
    subptr(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
#ifdef ASSERT
    if (sp_inc > 0) {
      // Poison the slots just below the saved rbp (see comment in the
      // stack-bang branch above).
      movl(Address(rsp, framesize), badRegWordVal);
      movl(Address(rsp, framesize + VMRegImpl::stack_slot_size), badRegWordVal);
    }
#endif
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (C->needs_stack_repair()) {
    // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
    assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
    movptr(Address(rsp, framesize - wordSize), sp_inc + framesize);
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    // Verify rsp is StackAlignmentInBytes-aligned modulo the pushed return
    // address (i.e. rsp % alignment == alignment - wordSize at call sites).
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif
}
 159 
 160 void C2_MacroAssembler::entry_barrier() {
 161   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 162   // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 163   Label dummy_slow_path;
 164   Label dummy_continuation;
 165   Label* slow_path = &dummy_slow_path;
 166   Label* continuation = &dummy_continuation;
 167   if (!Compile::current()->output()->in_scratch_emit_size()) {
 168     // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 169     C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 170     Compile::current()->output()->add_stub(stub);
 171     slow_path = &stub->entry();
 172     continuation = &stub->continuation();
 173   }
 174   bs->nmethod_entry_barrier(this, slow_path, continuation);
 175 }
 176 
 177 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 178   switch (vlen_in_bytes) {
 179     case  4: // fall-through
 180     case  8: // fall-through
 181     case 16: return Assembler::AVX_128bit;
 182     case 32: return Assembler::AVX_256bit;
 183     case 64: return Assembler::AVX_512bit;
 184 
 185     default: {
 186       ShouldNotReachHere();
 187       return Assembler::AVX_NoVec;
 188     }
 189   }
 190 }
 191 
 192 // fast_lock and fast_unlock used by C2
 193 
 194 // Because the transitions from emitted code to the runtime
 195 // monitorenter/exit helper stubs are so slow it's critical that
 196 // we inline both the stack-locking fast path and the inflated fast path.
 197 //
 198 // See also: cmpFastLock and cmpFastUnlock.
 199 //
 200 // What follows is a specialized inline transliteration of the code
 201 // in enter() and exit(). If we're concerned about I$ bloat another
 202 // option would be to emit TrySlowEnter and TrySlowExit methods
 203 // at startup-time.  These methods would accept arguments as
 204 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 205 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 206 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 207 // In practice, however, the # of lock sites is bounded and is usually small.
 208 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
 209 // if the processor uses simple bimodal branch predictors keyed by EIP
 210 // Since the helper routines would be called from multiple synchronization
 211 // sites.
 212 //
 213 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
 214 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 215 // to those specialized methods.  That'd give us a mostly platform-independent
 216 // implementation that the JITs could optimize and inline at their pleasure.
 217 // Done correctly, the only time we'd need to cross to native could would be
 218 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 219 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 220 // (b) explicit barriers or fence operations.
 221 //
 222 // TODO:
 223 //
 224 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 225 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 226 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 227 //    the lock operators would typically be faster than reifying Self.
 228 //
 229 // *  Ideally I'd define the primitives as:
 230 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 231 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 232 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 233 //    Instead, we're stuck with a rather awkward and brittle register assignments below.
 234 //    Furthermore the register assignments are overconstrained, possibly resulting in
 235 //    sub-optimal code near the synchronization site.
 236 //
 237 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 238 //    Alternately, use a better sp-proximity test.
 239 //
 240 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 241 //    Either one is sufficient to uniquely identify a thread.
 242 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 243 //
 244 // *  Intrinsify notify() and notifyAll() for the common cases where the
 245 //    object is locked by the calling thread but the waitlist is empty.
 246 //    avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
 247 //
 248 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 249 //    But beware of excessive branch density on AMD Opterons.
 250 //
 251 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 252 //    or failure of the fast path.  If the fast path fails then we pass
 253 //    control to the slow path, typically in C.  In fast_lock and
 254 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 255 //    will emit a conditional branch immediately after the node.
 256 //    So we have branches to branches and lots of ICC.ZF games.
 257 //    Instead, it might be better to have C2 pass a "FailureLabel"
 258 //    into fast_lock and fast_unlock.  In the case of success, control
 259 //    will drop through the node.  ICC.ZF is undefined at exit.
 260 //    In the case of failure, the node will branch directly to the
 261 //    FailureLabel
 262 
 263 // obj: object to lock
 264 // box: on-stack box address -- KILLED
 265 // rax: tmp -- KILLED
 266 // t  : tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register rax_reg,
                                  Register t, Register thread) {
  // Emits the C2 lock fast path. Contract with C2: control falls through with
  // ZF == 1 on success; jumps to the slow path are taken with ZF == 0.
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Value-based classes should not be synchronized on; divert to the slow
    // path which raises the diagnostic.
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Fast Lock

    Label push;

    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    // top is a byte offset into the thread; "greater" than the last valid
    // slot offset means no room to push.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    // If the current top-of-lock-stack entry is already obj, this is a
    // recursive lightweight lock: just push again, no CAS needed.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    // rax = expected (mark with unlocked bit set), mark = desired (bit
    // cleared). CAS failure leaves ZF == 0, as required for slow_path.
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmp(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      // Without the table, the mark word itself is the (tagged) monitor
      // pointer, already loaded into t above.
      assert(mark == monitor, "should be the same here");
    } else {
      const Register hash = t;
      Label monitor_found;

      // Look for the monitor in the om_cache.

      // Unrolled linear scan over the per-thread oop->monitor cache: each
      // entry's monitor is loaded speculatively before its oop is compared,
      // so monitor already holds the right value on a hit.
      ByteSize cache_offset   = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled  = OMCache::CAPACITY;
      for (int i = 0; i < num_unrolled; i++) {
        movptr(monitor, Address(thread,  cache_offset + monitor_offset));
        cmpptr(obj, Address(thread, cache_offset));
        jccb(Assembler::equal, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      // Look for the monitor in the table.

      // Get the hash code.
      // NOTE(review): relies on the object's identity hash being installed in
      // the mark word whenever a table-backed monitor exists — confirm.
      movptr(hash, Address(obj, oopDesc::mark_offset_in_bytes()));
      shrq(hash, markWord::hash_shift);
      andq(hash, markWord::hash_mask);

      // Get the table and calculate the bucket's address.
      lea(rax_reg, ExternalAddress(ObjectMonitorTable::current_table_address()));
      movptr(rax_reg, Address(rax_reg));
      andq(hash, Address(rax_reg, ObjectMonitorTable::table_capacity_mask_offset()));
      movptr(rax_reg, Address(rax_reg, ObjectMonitorTable::table_buckets_offset()));

      // Read the monitor from the bucket.
      movptr(monitor, Address(rax_reg, hash, Address::times_ptr));

      // Check if the monitor in the bucket is special (empty, tombstone or removed)
      // Special sentinel values compare below below_is_special.
      cmpptr(monitor, ObjectMonitorTable::SpecialPointerValues::below_is_special);
      jcc(Assembler::below, slow_path);

      // Check if object matches.
      // The monitor holds its object as a weak handle; resolve it (GC-aware)
      // and compare against obj. Hash collisions fall to the slow path.
      movptr(rax_reg, Address(monitor, ObjectMonitor::object_offset()));
      BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
      bs_asm->try_resolve_weak_handle_in_c2(this, rax_reg, slow_path);
      cmpptr(rax_reg, obj);
      jcc(Assembler::notEqual, slow_path);

      bind(monitor_found);
    }
    // Without the table, monitor still carries the 0b10 tag; fold it into the
    // displacement of every field access instead of stripping it.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock before trashing box. On failure to acquire
      // the lock, the slow path will reset the entry accordingly (see CacheSetter).
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
    lock(); cmpxchgptr(box, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive.
    // CAS failed: rax now holds the current owner; if it is us, this is a
    // recursive monitor enter.
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
 438 
 439 // obj: object to lock
 440 // rax: tmp -- KILLED
 441 // t  : tmp - cannot be obj nor rax -- KILLED
 442 //
 443 // Some commentary on balanced locking:
 444 //
 445 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 446 // Methods that don't have provably balanced locking are forced to run in the
 447 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 448 // The interpreter provides two properties:
 449 // I1:  At return-time the interpreter automatically and quietly unlocks any
 450 //      objects acquired in the current activation (frame).  Recall that the
 451 //      interpreter maintains an on-stack list of locks currently held by
 452 //      a frame.
 453 // I2:  If a method attempts to unlock an object that is not held by the
 454 //      frame the interpreter throws IMSX.
 455 //
 456 // Lets say A(), which has provably balanced locking, acquires O and then calls B().
 457 // B() doesn't have provably balanced locking so it runs in the interpreter.
 458 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 459 // is still locked by A().
 460 //
 461 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface
 462 // Specification" states that an object locked by JNI's MonitorEnter should not be
 463 // unlocked by "normal" java-level locking and vice-versa.  The specification doesn't
 464 // specify what will occur if a program engages in such mixed-mode locking, however.
 465 // Arguably given that the spec legislates the JNI case as undefined our implementation
 466 // could reasonably *avoid* checking owner in fast_unlock().
 467 // In the interest of performance we elide m->Owner==Self check in unlock.
 468 // A perfectly viable alternative is to elide the owner check except when
 469 // Xcheck:jni is enabled.
 470 
void C2_MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register t, Register thread) {
  // Emits the C2 unlock fast path. Contract with C2: control falls through
  // with ZF == 1 on success; the slow path is reached with ZF == 0.
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully.  MUST jump with ZF == 1
  Label unlocked, slow_path;

  // Register aliases: mark/monitor/top all share t or rax depending on mode;
  // each is live only in the phase named by its alias.
  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    // Real emission pass: allocate the out-of-line stub that re-pushes obj on
    // the lock stack before entering the slow path.
    stub = new (Compile::current()->comp_arena()) C2FastUnlockStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Fast Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    // If the next-to-top entry is also obj, the pop above fully unlocked a
    // recursive lightweight lock; cmpptr leaves ZF == 1 as required.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    // rax = expected (locked mark), mark = desired (unlocked bit set). If the
    // CAS fails (e.g. the lock was inflated concurrently), the stub re-pushes
    // obj onto the lock stack and takes the slow path.
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    // Debug-only: walk the remainder of the lock stack and verify obj is not
    // on it (an inflated-locked object must not appear on the lock stack).
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jcc(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      // Without the table, the mark word (in t) is the tagged monitor pointer.
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    // Fold the 0b10 mark tag (non-table mode only) into field displacements.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jcc(Assembler::notZero, recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    cmpptr(entry_list_address, NULL_WORD);
    jcc(Assembler::zero, unlocked);    // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jcc(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
  }

  bind(unlocked);
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Unlock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
 632 
// Out-of-line failure handler for verify_int_in_range(): called from compiled
// code when a CastII value escapes its [lo, hi] type range. Never returns.
static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}
 636 
 637 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
 638   const int framesize = Compile::current()->output()->frame_size_in_bytes();
 639   masm->movptr(dst, rsp);
 640   if (framesize > 2 * wordSize) {
 641     masm->addptr(dst, framesize - 2 * wordSize);
 642   }
 643 }
 644 
 645 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
 646   if (PreserveFramePointer) {
 647     // frame pointer is valid
 648 #ifdef ASSERT
 649     // Verify frame pointer value in rbp.
 650     reconstruct_frame_pointer_helper(this, rtmp);
 651     Label L_success;
 652     cmpq(rbp, rtmp);
 653     jccb(Assembler::equal, L_success);
 654     STOP("frame pointer mismatch");
 655     bind(L_success);
 656 #endif // ASSERT
 657   } else {
 658     reconstruct_frame_pointer_helper(this, rbp);
 659   }
 660 }
 661 
// Emit a debug range check that val lies within the CastII type t = [lo, hi].
// On violation, materializes the arguments and calls the fatal() helper;
// never returns from the fail path (hlt after the call).
void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
  jint lo = t->_lo;
  jint hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
  if (t == TypeInt::INT) {
    // Full int range: nothing to check.
    return;
  }

  BLOCK_COMMENT("CastII {");
  Label fail;
  Label succeed;

  // A bound at the type-system extreme can never be violated; skip its test.
  if (lo != min_jint) {
    cmpl(val, lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jint) {
    cmpl(val, hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  // NOTE(review): assumes val is not c_rarg0, which is written first below
  // and would clobber val — confirm the register constraints at the call site.
  movl(c_rarg0, idx);
  movl(c_rarg1, val);
  movl(c_rarg2, lo);
  movl(c_rarg3, hi);
  // The abort helper walks the stack; make sure rbp is a real frame pointer.
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastII");
}
 695 
// Out-of-line failure handler for verify_long_in_range(): called from compiled
// code when a CastLL value escapes its [lo, hi] type range. Never returns.
static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}
 699 
// Emit a debug range check that val lies within the CastLL type t = [lo, hi].
// tmp is used to materialize 64-bit bounds that do not fit in a simm32.
// On violation, calls the fatal() helper; never returns from the fail path.
void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
  jlong lo = t->_lo;
  jlong hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
  if (t == TypeLong::LONG) {
    // Full long range: nothing to check.
    return;
  }

  BLOCK_COMMENT("CastLL {");
  Label fail;
  Label succeed;

  // Compare val against a 64-bit immediate bound, using tmp only when the
  // bound does not fit in the 32-bit sign-extended immediate form of cmpq.
  auto cmp_val = [&](jlong bound) {
    if (is_simm32(bound)) {
      cmpq(val, checked_cast<int>(bound));
    } else {
      mov64(tmp, bound);
      cmpq(val, tmp);
    }
  };

  // A bound at the type-system extreme can never be violated; skip its test.
  if (lo != min_jlong) {
    cmp_val(lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jlong) {
    cmp_val(hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  // NOTE(review): assumes val is not c_rarg0, which is written first below
  // and would clobber val — confirm the register constraints at the call site.
  movl(c_rarg0, idx);
  movq(c_rarg1, val);
  mov64(c_rarg2, lo);
  mov64(c_rarg3, hi);
  // The abort helper walks the stack; make sure rbp is a real frame pointer.
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastLL");
}
 742 
 743 //-------------------------------------------------------------------------------------------
 744 // Generic instructions support for use in .ad files C2 code generation
 745 
 746 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 747   if (dst != src) {
 748     movdqu(dst, src);
 749   }
 750   if (opcode == Op_AbsVD) {
 751     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 752   } else {
 753     assert((opcode == Op_NegVD),"opcode should be Op_NegD");
 754     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 755   }
 756 }
 757 
 758 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 759   if (opcode == Op_AbsVD) {
 760     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 761   } else {
 762     assert((opcode == Op_NegVD),"opcode should be Op_NegD");
 763     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 764   }
 765 }
 766 
 767 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 768   if (dst != src) {
 769     movdqu(dst, src);
 770   }
 771   if (opcode == Op_AbsVF) {
 772     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 773   } else {
 774     assert((opcode == Op_NegVF),"opcode should be Op_NegF");
 775     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 776   }
 777 }
 778 
 779 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 780   if (opcode == Op_AbsVF) {
 781     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 782   } else {
 783     assert((opcode == Op_NegVF),"opcode should be Op_NegF");
 784     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 785   }
 786 }
 787 
 788 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 789   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 790   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 791 
 792   if (opcode == Op_MinV) {
 793     if (elem_bt == T_BYTE) {
 794       pminsb(dst, src);
 795     } else if (elem_bt == T_SHORT) {
 796       pminsw(dst, src);
 797     } else if (elem_bt == T_INT) {
 798       pminsd(dst, src);
 799     } else {
 800       assert(elem_bt == T_LONG, "required");
 801       assert(tmp == xmm0, "required");
 802       assert_different_registers(dst, src, tmp);
 803       movdqu(xmm0, dst);
 804       pcmpgtq(xmm0, src);
 805       blendvpd(dst, src);  // xmm0 as mask
 806     }
 807   } else { // opcode == Op_MaxV
 808     if (elem_bt == T_BYTE) {
 809       pmaxsb(dst, src);
 810     } else if (elem_bt == T_SHORT) {
 811       pmaxsw(dst, src);
 812     } else if (elem_bt == T_INT) {
 813       pmaxsd(dst, src);
 814     } else {
 815       assert(elem_bt == T_LONG, "required");
 816       assert(tmp == xmm0, "required");
 817       assert_different_registers(dst, src, tmp);
 818       movdqu(xmm0, src);
 819       pcmpgtq(xmm0, dst);
 820       blendvpd(dst, src);  // xmm0 as mask
 821     }
 822   }
 823 }
 824 
 825 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 826                                   XMMRegister src1, Address src2, int vlen_enc) {
 827   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 828   if (opcode == Op_UMinV) {
 829     switch(elem_bt) {
 830       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 831       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 832       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 833       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 834       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 835     }
 836   } else {
 837     assert(opcode == Op_UMaxV, "required");
 838     switch(elem_bt) {
 839       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 840       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 841       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 842       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 843       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 844     }
 845   }
 846 }
 847 
// Unsigned 64-bit element min/max for targets without a direct unsigned
// quadword min/max at the requested width. On AVX-512 (without VL) the
// operation is widened to 512 bits; otherwise both operands are biased by
// the sign bit so the signed vpcmpgtq can act as an unsigned comparison,
// and the original (unbiased) sources are blended by the resulting mask.
// xtmp1/xtmp2 are clobbered on the non-EVEX path.
void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // For optimality, leverage a full vector width of 512 bits
  // for operations over smaller vector sizes on AVX512 targets.
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
    // T1 = -1
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
    // Mask = T2 > T1  (signed compare of the biased values == unsigned compare of the originals)
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}
 878 
 879 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 880                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
 881   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 882   if (opcode == Op_UMinV) {
 883     switch(elem_bt) {
 884       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 885       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 886       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 887       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 888       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 889     }
 890   } else {
 891     assert(opcode == Op_UMaxV, "required");
 892     switch(elem_bt) {
 893       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 894       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 895       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 896       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 897       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 898     }
 899   }
 900 }
 901 
 902 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 903                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 904                                  int vlen_enc) {
 905   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 906 
 907   if (opcode == Op_MinV) {
 908     if (elem_bt == T_BYTE) {
 909       vpminsb(dst, src1, src2, vlen_enc);
 910     } else if (elem_bt == T_SHORT) {
 911       vpminsw(dst, src1, src2, vlen_enc);
 912     } else if (elem_bt == T_INT) {
 913       vpminsd(dst, src1, src2, vlen_enc);
 914     } else {
 915       assert(elem_bt == T_LONG, "required");
 916       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 917         vpminsq(dst, src1, src2, vlen_enc);
 918       } else {
 919         assert_different_registers(dst, src1, src2);
 920         vpcmpgtq(dst, src1, src2, vlen_enc);
 921         vblendvpd(dst, src1, src2, dst, vlen_enc);
 922       }
 923     }
 924   } else { // opcode == Op_MaxV
 925     if (elem_bt == T_BYTE) {
 926       vpmaxsb(dst, src1, src2, vlen_enc);
 927     } else if (elem_bt == T_SHORT) {
 928       vpmaxsw(dst, src1, src2, vlen_enc);
 929     } else if (elem_bt == T_INT) {
 930       vpmaxsd(dst, src1, src2, vlen_enc);
 931     } else {
 932       assert(elem_bt == T_LONG, "required");
 933       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 934         vpmaxsq(dst, src1, src2, vlen_enc);
 935       } else {
 936         assert_different_registers(dst, src1, src2);
 937         vpcmpgtq(dst, src1, src2, vlen_enc);
 938         vblendvpd(dst, src2, src1, dst, vlen_enc);
 939       }
 940     }
 941   }
 942 }
 943 
 944 // Float/Double min max
 945 
// Vector float/double min/max with Java semantics (-0.0 < +0.0, NaN
// propagates), using AVX1+ blend instructions (no opmask registers).
// tmp, atmp and btmp are clobbered; dst may alias btmp (handled below).
void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   *  Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  // Select the per-type, per-direction emitters. 'mask' is the operand whose
  // sign bits drive the initial blend: a for min, b for max (see pseudo code).
  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
  if (precompute_mask && !is_double_word) {
    // Fill each 32-bit lane with its sign bit (arithmetic shift past the width).
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    // tmp = (0 > mask) per 64-bit lane: all-ones where the sign bit is set.
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  // atmp/btmp = sign-routed operand pair (see pseudo code above).
  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  // scratch = lanes where atmp is NaN (unordered with itself); propagate those.
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}
1033 
// AVX-512 variant of vminmax_fp: same Java min/max semantics (-0.0 < +0.0,
// NaN propagates), implemented with opmask registers instead of vector blends.
// Each branch follows the same four steps:
//  1. ktmp = per-lane sign-bit mask of a (min) or b (max),
//  2. route the operands into atmp/btmp so signed zeros order correctly,
//  3. hardware vmin/vmax into dst,
//  4. overwrite lanes where atmp is NaN (unordered with itself) with atmp.
// ktmp, atmp and btmp are clobbered.
void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}
1080 
1081 void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1082                                    XMMRegister src1, XMMRegister src2, int vlen_enc) {
1083   assert(opc == Op_MinV || opc == Op_MinReductionV ||
1084          opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");
1085 
1086   int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
1087                                                          : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
1088   if (elem_bt == T_FLOAT) {
1089     evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
1090   } else {
1091     assert(elem_bt == T_DOUBLE, "");
1092     evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
1093   }
1094 }
1095 
// Float/Double signum
// Computes Math.signum: the argument itself for +0.0/-0.0/NaN, otherwise
// +1.0 or -1.0 matching the argument's sign. dst holds both input and result;
// 'zero' and 'one' hold the constants 0.0 and 1.0 respectively (caller-provided).
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  // Handle special cases +0.0/-0.0 and NaN, if argument is +0.0/-0.0 or NaN, return argument
  // If AVX10.2 (or newer) floating point comparison instructions used, SF=1 for equal and unordered cases
  // If other floating point comparison instructions used, ZF=1 for equal and unordered cases
  if (opcode == Op_SignumF) {
    if (VM_Version::supports_avx10_2()) {
      vucomxss(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomiss(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    // Load +1.0. The SSE move does not touch EFLAGS, so the branch below still
    // tests the dst-vs-zero comparison made above.
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);  // input was > 0.0: result is +1.0
    // Input was negative: flip the sign bit of +1.0 to produce -1.0.
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    if (VM_Version::supports_avx10_2()) {
      vucomxsd(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomisd(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    // Same pattern as the float case, on the double constants.
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}
1131 
1132 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1133   if (sign) {
1134     pmovsxbw(dst, src);
1135   } else {
1136     pmovzxbw(dst, src);
1137   }
1138 }
1139 
1140 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1141   if (sign) {
1142     vpmovsxbw(dst, src, vector_len);
1143   } else {
1144     vpmovzxbw(dst, src, vector_len);
1145   }
1146 }
1147 
1148 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1149   if (sign) {
1150     vpmovsxbd(dst, src, vector_len);
1151   } else {
1152     vpmovzxbd(dst, src, vector_len);
1153   }
1154 }
1155 
1156 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1157   if (sign) {
1158     vpmovsxwd(dst, src, vector_len);
1159   } else {
1160     vpmovzxwd(dst, src, vector_len);
1161   }
1162 }
1163 
1164 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1165                                      int shift, int vector_len) {
1166   if (opcode == Op_RotateLeftV) {
1167     if (etype == T_INT) {
1168       evprold(dst, src, shift, vector_len);
1169     } else {
1170       assert(etype == T_LONG, "expected type T_LONG");
1171       evprolq(dst, src, shift, vector_len);
1172     }
1173   } else {
1174     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1175     if (etype == T_INT) {
1176       evprord(dst, src, shift, vector_len);
1177     } else {
1178       assert(etype == T_LONG, "expected type T_LONG");
1179       evprorq(dst, src, shift, vector_len);
1180     }
1181   }
1182 }
1183 
1184 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1185                                      XMMRegister shift, int vector_len) {
1186   if (opcode == Op_RotateLeftV) {
1187     if (etype == T_INT) {
1188       evprolvd(dst, src, shift, vector_len);
1189     } else {
1190       assert(etype == T_LONG, "expected type T_LONG");
1191       evprolvq(dst, src, shift, vector_len);
1192     }
1193   } else {
1194     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1195     if (etype == T_INT) {
1196       evprorvd(dst, src, shift, vector_len);
1197     } else {
1198       assert(etype == T_LONG, "expected type T_LONG");
1199       evprorvq(dst, src, shift, vector_len);
1200     }
1201   }
1202 }
1203 
1204 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1205   if (opcode == Op_RShiftVI) {
1206     psrad(dst, shift);
1207   } else if (opcode == Op_LShiftVI) {
1208     pslld(dst, shift);
1209   } else {
1210     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1211     psrld(dst, shift);
1212   }
1213 }
1214 
1215 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1216   switch (opcode) {
1217     case Op_RShiftVI:  psrad(dst, shift); break;
1218     case Op_LShiftVI:  pslld(dst, shift); break;
1219     case Op_URShiftVI: psrld(dst, shift); break;
1220 
1221     default: assert(false, "%s", NodeClassNames[opcode]);
1222   }
1223 }
1224 
1225 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1226   if (opcode == Op_RShiftVI) {
1227     vpsrad(dst, nds, shift, vector_len);
1228   } else if (opcode == Op_LShiftVI) {
1229     vpslld(dst, nds, shift, vector_len);
1230   } else {
1231     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1232     vpsrld(dst, nds, shift, vector_len);
1233   }
1234 }
1235 
1236 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1237   switch (opcode) {
1238     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1239     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1240     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1241 
1242     default: assert(false, "%s", NodeClassNames[opcode]);
1243   }
1244 }
1245 
1246 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1247   switch (opcode) {
1248     case Op_RShiftVB:  // fall-through
1249     case Op_RShiftVS:  psraw(dst, shift); break;
1250 
1251     case Op_LShiftVB:  // fall-through
1252     case Op_LShiftVS:  psllw(dst, shift);   break;
1253 
1254     case Op_URShiftVS: // fall-through
1255     case Op_URShiftVB: psrlw(dst, shift);  break;
1256 
1257     default: assert(false, "%s", NodeClassNames[opcode]);
1258   }
1259 }
1260 
1261 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1262   switch (opcode) {
1263     case Op_RShiftVB:  // fall-through
1264     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1265 
1266     case Op_LShiftVB:  // fall-through
1267     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1268 
1269     case Op_URShiftVS: // fall-through
1270     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1271 
1272     default: assert(false, "%s", NodeClassNames[opcode]);
1273   }
1274 }
1275 
1276 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1277   switch (opcode) {
1278     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems
1279     case Op_LShiftVL:  psllq(dst, shift); break;
1280     case Op_URShiftVL: psrlq(dst, shift); break;
1281 
1282     default: assert(false, "%s", NodeClassNames[opcode]);
1283   }
1284 }
1285 
1286 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1287   if (opcode == Op_RShiftVL) {
1288     psrlq(dst, shift);  // using srl to implement sra on pre-avs512 systems
1289   } else if (opcode == Op_LShiftVL) {
1290     psllq(dst, shift);
1291   } else {
1292     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1293     psrlq(dst, shift);
1294   }
1295 }
1296 
1297 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1298   switch (opcode) {
1299     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1300     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1301     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1302 
1303     default: assert(false, "%s", NodeClassNames[opcode]);
1304   }
1305 }
1306 
1307 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1308   if (opcode == Op_RShiftVL) {
1309     evpsraq(dst, nds, shift, vector_len);
1310   } else if (opcode == Op_LShiftVL) {
1311     vpsllq(dst, nds, shift, vector_len);
1312   } else {
1313     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1314     vpsrlq(dst, nds, shift, vector_len);
1315   }
1316 }
1317 
1318 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1319   switch (opcode) {
1320     case Op_RShiftVB:  // fall-through
1321     case Op_RShiftVS:  // fall-through
1322     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1323 
1324     case Op_LShiftVB:  // fall-through
1325     case Op_LShiftVS:  // fall-through
1326     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1327 
1328     case Op_URShiftVB: // fall-through
1329     case Op_URShiftVS: // fall-through
1330     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1331 
1332     default: assert(false, "%s", NodeClassNames[opcode]);
1333   }
1334 }
1335 
1336 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1337   switch (opcode) {
1338     case Op_RShiftVB:  // fall-through
1339     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1340 
1341     case Op_LShiftVB:  // fall-through
1342     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1343 
1344     case Op_URShiftVB: // fall-through
1345     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1346 
1347     default: assert(false, "%s", NodeClassNames[opcode]);
1348   }
1349 }
1350 
// Variable (per-lane) 64-bit shift. There is no vpsravq before AVX-512, so
// on AVX2 the arithmetic right shift is emulated; tmp is only used on that
// emulation path and must be xnoreg otherwise.
void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          // Without AVX512VL, evpsravq is only legal at 512-bit width.
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        // AVX2 emulation of arithmetic shift via the identity
        //   sra(x, s) == (srl(x, s) ^ srl(m, s)) - srl(m, s)
        // where m is the per-lane sign-bit mask loaded from
        // vector_long_sign_mask (presumably 0x8000000000000000 per lane —
        // the identity requires exactly that; confirm against the stub).
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}
1383 
// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
// Per-lane byte shift for a 128-bit source: widen the bytes to dwords, shift
// as dwords, mask each result back to byte range, then pack down to words.
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);      // arithmetic shifts need sign extension
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);              // bytes -> dwords (256-bit)
  vpmovzxbd(vtmp, shift, 1);                 // shift counts are always zero-extended
  varshiftd(opcode, dst, dst, vtmp, 1);      // do the shift at dword width
  // Keep only the low byte of each dword result.
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);              // pack dwords back down to words
}
1398 
// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
// Per-lane byte shift via word width: widen bytes to words, shift, mask back
// to byte range, then pack down to bytes. For the 256-bit source case the
// vpermq fixes the qword interleaving left by the in-lane vpackuswb.
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);      // arithmetic shifts need sign extension
  int ext_vector_len = vector_len + 1;       // widened data needs twice the vector width
  vextendbw(sign, dst, src, ext_vector_len); // bytes -> words
  vpmovzxbw(vtmp, shift, ext_vector_len);    // shift counts are always zero-extended
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  // Keep only the low byte of each word result.
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    vpermq(dst, dst, 0xD8, vector_len);      // restore element order across 128-bit lanes
  }
}
1419 
1420 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1421   switch(typ) {
1422     case T_BYTE:
1423       pinsrb(dst, val, idx);
1424       break;
1425     case T_SHORT:
1426       pinsrw(dst, val, idx);
1427       break;
1428     case T_INT:
1429       pinsrd(dst, val, idx);
1430       break;
1431     case T_LONG:
1432       pinsrq(dst, val, idx);
1433       break;
1434     default:
1435       assert(false,"Should not reach here.");
1436       break;
1437   }
1438 }
1439 
1440 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1441   switch(typ) {
1442     case T_BYTE:
1443       vpinsrb(dst, src, val, idx);
1444       break;
1445     case T_SHORT:
1446       vpinsrw(dst, src, val, idx);
1447       break;
1448     case T_INT:
1449       vpinsrd(dst, src, val, idx);
1450       break;
1451     case T_LONG:
1452       vpinsrq(dst, src, val, idx);
1453       break;
1454     default:
1455       assert(false,"Should not reach here.");
1456       break;
1457   }
1458 }
1459 
// Gather one 64-bit slice (8 byte elements or 4 short elements) under a
// scalar bit mask held in 'mask': lanes whose bit is clear stay zero.
// idx_base points at 32-bit gather indices; mask_idx is the running bit
// position and is advanced by one per lane (it stays live across calls,
// see vgather_subword). rtmp is clobbered.
void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
                                         Register base, Register idx_base,
                                         Register mask, Register mask_idx,
                                         Register rtmp, int vlen_enc) {
  // Clear dst so masked-off lanes read as zero.
  vpxor(dst, dst, dst, vlen_enc);
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);                  // CF = mask bit for this lane
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4)); // load the 32-bit index
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
      bind(skip_load);
      incq(mask_idx);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);                  // CF = mask bit for this lane
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4)); // load the 32-bit index
      pinsrb(dst, Address(base, rtmp), i);
      bind(skip_load);
      incq(mask_idx);
    }
  }
}
1490 
1491 void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
1492                                   Register base, Register idx_base,
1493                                   Register rtmp, int vlen_enc) {
1494   vpxor(dst, dst, dst, vlen_enc);
1495   if (elem_bt == T_SHORT) {
1496     for (int i = 0; i < 4; i++) {
1497       // dst[i] = src[idx_base[i]]
1498       movl(rtmp, Address(idx_base, i * 4));
1499       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1500     }
1501   } else {
1502     assert(elem_bt == T_BYTE, "");
1503     for (int i = 0; i < 8; i++) {
1504       // dst[i] = src[idx_base[i]]
1505       movl(rtmp, Address(idx_base, i * 4));
1506       pinsrb(dst, Address(base, rtmp), i);
1507     }
1508   }
1509 }
1510 
1511 /*
1512  * Gather using hybrid algorithm, first partially unroll scalar loop
1513  * to accumulate values from gather indices into a quad-word(64bit) slice.
1514  * A slice may hold 8 bytes or 4 short values. This is followed by a vector
1515  * permutation to place the slice into appropriate vector lane
1516  * locations in destination vector. Following pseudo code describes the
1517  * algorithm in detail:
1518  *
1519  * DST_VEC = ZERO_VEC
1520  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1521  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1522  * FOREACH_ITER:
1523  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1524  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1525  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1526  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1527  *
1528  * With each iteration, doubleword permute indices (0,1) corresponding
1529  * to gathered quadword gets right shifted by two lane positions.
1530  *
1531  */
// Sub-word gather; see the algorithm description in the comment above.
// Gathers 8 bytes per iteration (via vgather8b[_masked]) and merges each
// slice into its destination lanes with a permute + OR. idx_base, mask_idx
// and length are advanced/consumed; xtmp1, xtmp2, temp_dst and rtmp are
// clobbered. Pass mask == noreg for the unmasked variant.
void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
                                        Register base, Register idx_base,
                                        Register mask, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister temp_dst,
                                        Register rtmp, Register mask_idx,
                                        Register length, int vector_len, int vlen_enc) {
  Label GATHER8_LOOP;
  assert(is_subword_type(elem_ty), "");
  movl(length, vector_len);             // lanes still to fill
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
  vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
  vallones(xtmp2, vlen_enc);
  vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); // 0 - (-1) = 1 per lane
  vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
  load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}

  bind(GATHER8_LOOP);
    // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
    if (mask == noreg) {
      vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
    } else {
      vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
    }
    // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
    vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
    // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
    vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
    // DST_VEC = DST_VEC OR TEMP_PERM_VEC
    vpor(dst, dst, temp_dst, vlen_enc);
    // Advance past the gathered slice: 8 (T_BYTE) or 4 (T_SHORT) four-byte
    // indices, i.e. 32 or 16 bytes of index data; same count off 'length'.
    addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
    subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
    jcc(Assembler::notEqual, GATHER8_LOOP);
}
1565 
1566 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1567   switch(typ) {
1568     case T_INT:
1569       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1570       break;
1571     case T_FLOAT:
1572       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1573       break;
1574     case T_LONG:
1575       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1576       break;
1577     case T_DOUBLE:
1578       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1579       break;
1580     default:
1581       assert(false,"Should not reach here.");
1582       break;
1583   }
1584 }
1585 
1586 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1587   switch(typ) {
1588     case T_INT:
1589       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1590       break;
1591     case T_FLOAT:
1592       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1593       break;
1594     case T_LONG:
1595       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1596       break;
1597     case T_DOUBLE:
1598       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1599       break;
1600     default:
1601       assert(false,"Should not reach here.");
1602       break;
1603   }
1604 }
1605 
1606 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1607   switch(typ) {
1608     case T_INT:
1609       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1610       break;
1611     case T_FLOAT:
1612       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1613       break;
1614     case T_LONG:
1615       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1616       break;
1617     case T_DOUBLE:
1618       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1619       break;
1620     default:
1621       assert(false,"Should not reach here.");
1622       break;
1623   }
1624 }
1625 
1626 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1627   if (vlen_in_bytes <= 16) {
1628     pxor (dst, dst);
1629     psubb(dst, src);
1630     switch (elem_bt) {
1631       case T_BYTE:   /* nothing to do */ break;
1632       case T_SHORT:  pmovsxbw(dst, dst); break;
1633       case T_INT:    pmovsxbd(dst, dst); break;
1634       case T_FLOAT:  pmovsxbd(dst, dst); break;
1635       case T_LONG:   pmovsxbq(dst, dst); break;
1636       case T_DOUBLE: pmovsxbq(dst, dst); break;
1637 
1638       default: assert(false, "%s", type2name(elem_bt));
1639     }
1640   } else {
1641     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1642     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1643 
1644     vpxor (dst, dst, dst, vlen_enc);
1645     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1646 
1647     switch (elem_bt) {
1648       case T_BYTE:   /* nothing to do */            break;
1649       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1650       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1651       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1652       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1653       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1654 
1655       default: assert(false, "%s", type2name(elem_bt));
1656     }
1657   }
1658 }
1659 
1660 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1661   if (novlbwdq) {
1662     vpmovsxbd(xtmp, src, vlen_enc);
1663     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1664             Assembler::eq, true, vlen_enc, noreg);
1665   } else {
1666     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1667     vpsubb(xtmp, xtmp, src, vlen_enc);
1668     evpmovb2m(dst, xtmp, vlen_enc);
1669   }
1670 }
1671 
1672 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1673   if (is_integral_type(bt)) {
1674     switch (vlen_in_bytes) {
1675       case 4:  movdl(dst, src);   break;
1676       case 8:  movq(dst, src);    break;
1677       case 16: movdqu(dst, src);  break;
1678       case 32: vmovdqu(dst, src); break;
1679       case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1680       default: ShouldNotReachHere();
1681     }
1682   } else {
1683     switch (vlen_in_bytes) {
1684       case 4:  movflt(dst, src); break;
1685       case 8:  movdbl(dst, src); break;
1686       case 16: movups(dst, src); break;
1687       case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1688       case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1689       default: ShouldNotReachHere();
1690     }
1691   }
1692 }
1693 
1694 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1695   assert(rscratch != noreg || always_reachable(src), "missing");
1696 
1697   if (reachable(src)) {
1698     load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1699   } else {
1700     lea(rscratch, src);
1701     load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1702   }
1703 }
1704 
// Splat a constant located at 'src' across a vector register, picking the
// cheapest broadcast form the current CPU supports; falls back to a plain
// vector load when no broadcast is available.
void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
  int vlen_enc = vector_length_encoding(vlen);
  if (VM_Version::supports_avx()) {
    if (bt == T_LONG) {
      // 64-bit integral: integer-domain broadcast needs AVX2; plain AVX
      // falls back to the FP-domain vmovddup.
      if (VM_Version::supports_avx2()) {
        vpbroadcastq(dst, src, vlen_enc);
      } else {
        vmovddup(dst, src, vlen_enc);
      }
    } else if (bt == T_DOUBLE) {
      // For a 128-bit vector use vmovddup instead of vbroadcastsd.
      if (vlen_enc != Assembler::AVX_128bit) {
        vbroadcastsd(dst, src, vlen_enc, noreg);
      } else {
        vmovddup(dst, src, vlen_enc);
      }
    } else {
      // Remaining element types: 32-bit granularity broadcast.
      if (VM_Version::supports_avx2() && is_integral_type(bt)) {
        vpbroadcastd(dst, src, vlen_enc);
      } else {
        vbroadcastss(dst, src, vlen_enc);
      }
    }
  } else if (VM_Version::supports_sse3()) {
    movddup(dst, src); // duplicate the low 64 bits (128-bit vector)
  } else {
    load_vector(bt, dst, src, vlen); // no broadcast available
  }
}
1733 
1734 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1735   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1736   int offset = exact_log2(type2aelembytes(bt)) << 6;
1737   if (is_floating_point_type(bt)) {
1738     offset += 128;
1739   }
1740   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1741   load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1742 }
1743 
1744 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1745 
// Emit one 128-bit combining step of a strictly-ordered vector reduction:
// dst = dst <opcode> src, with element type 'typ'. This is the 128-bit leaf
// used by the lane-folding reduce* helpers below. Note that the FP add/mul
// cases use the scalar (ss/sd) forms - only lane 0 is combined.
void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
  int vector_len = Assembler::AVX_128bit;

  switch (opcode) {
    case Op_AndReductionV:  pand(dst, src); break;
    case Op_OrReductionV:   por (dst, src); break;
    case Op_XorReductionV:  pxor(dst, src); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:        pminsb(dst, src); break;
        case T_SHORT:       pminsw(dst, src); break;
        case T_INT:         pminsd(dst, src); break;
        case T_LONG:        assert(UseAVX > 2, "required"); // packed 64-bit min needs AVX-512
                            vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:        pmaxsb(dst, src); break;
        case T_SHORT:       pmaxsw(dst, src); break;
        case T_INT:         pmaxsd(dst, src); break;
        case T_LONG:        assert(UseAVX > 2, "required"); // packed 64-bit max needs AVX-512
                            vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMinReductionV:
      // Unsigned min/max; the 64-bit forms are EVEX-only (k0 = no masking).
      switch (typ) {
        case T_BYTE:        vpminub(dst, dst, src, Assembler::AVX_128bit); break;
        case T_SHORT:       vpminuw(dst, dst, src, Assembler::AVX_128bit); break;
        case T_INT:         vpminud(dst, dst, src, Assembler::AVX_128bit); break;
        case T_LONG:        evpminuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxub(dst, dst, src, Assembler::AVX_128bit); break;
        case T_SHORT:       vpmaxuw(dst, dst, src, Assembler::AVX_128bit); break;
        case T_INT:         vpmaxud(dst, dst, src, Assembler::AVX_128bit); break;
        case T_LONG:        evpmaxuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVF: addss(dst, src); break; // scalar: combines lane 0 only
    case Op_AddReductionVD: addsd(dst, src); break; // scalar: combines lane 0 only
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:        paddb(dst, src); break;
        case T_SHORT:       paddw(dst, src); break;
        case T_INT:         paddd(dst, src); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: paddq(dst, src); break;
    case Op_MulReductionVF: mulss(dst, src); break; // scalar: combines lane 0 only
    case Op_MulReductionVD: mulsd(dst, src); break; // scalar: combines lane 0 only
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT:       pmullw(dst, src); break;
        case T_INT:         pmulld(dst, src); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: assert(UseAVX > 2, "required"); // packed 64-bit multiply needs AVX-512
                            evpmullq(dst, dst, src, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}
1816 
1817 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1818   switch (opcode) {
1819     case Op_AddReductionVF: addps(dst, src); break;
1820     case Op_AddReductionVD: addpd(dst, src); break;
1821     case Op_MulReductionVF: mulps(dst, src); break;
1822     case Op_MulReductionVD: mulpd(dst, src); break;
1823     default:                assert(false, "%s", NodeClassNames[opcode]);
1824   }
1825 }
1826 
// Emit one 256-bit combining step of a vector reduction:
// dst = src1 <opcode> src2, with element type 'typ'. 256-bit counterpart of
// reduce_operation_128(); integral opcodes only (no FP cases here).
void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
    case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
    case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
        case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
        case T_LONG:        assert(UseAVX > 2, "required"); // packed 64-bit min needs AVX-512
                            vpminsq(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
        case T_LONG:        assert(UseAVX > 2, "required"); // packed 64-bit max needs AVX-512
                            vpmaxsq(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMinReductionV:
      // Unsigned min/max; the 64-bit forms are EVEX-only (k0 = no masking).
      switch (typ) {
        case T_BYTE:        vpminub(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpminuw(dst, src1, src2, vector_len); break;
        case T_INT:         vpminud(dst, src1, src2, vector_len); break;
        case T_LONG:        evpminuq(dst, k0, src1, src2, true, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxub(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpmaxuw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmaxud(dst, src1, src2, vector_len); break;
        case T_LONG:        evpmaxuq(dst, k0, src1, src2, true, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
        case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; // EVEX-only packed 64-bit multiply
    default:                assert(false, "wrong opcode");
  }
}
1892 
1893 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1894   int vector_len = Assembler::AVX_256bit;
1895 
1896   switch (opcode) {
1897     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1898     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1899     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1900     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1901     default:                assert(false, "%s", NodeClassNames[opcode]);
1902   }
1903 }
1904 
1905 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1906                                   XMMRegister dst, XMMRegister src,
1907                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1908   switch (opcode) {
1909     case Op_AddReductionVF:
1910     case Op_MulReductionVF:
1911       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1912       break;
1913 
1914     case Op_AddReductionVD:
1915     case Op_MulReductionVD:
1916       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1917       break;
1918 
1919     default: assert(false, "wrong opcode");
1920   }
1921 }
1922 
1923 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1924                                             XMMRegister dst, XMMRegister src,
1925                                             XMMRegister vtmp1, XMMRegister vtmp2) {
1926   switch (opcode) {
1927     case Op_AddReductionVF:
1928     case Op_MulReductionVF:
1929       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1930       break;
1931 
1932     case Op_AddReductionVD:
1933     case Op_MulReductionVD:
1934       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1935       break;
1936 
1937     default: assert(false, "%s", NodeClassNames[opcode]);
1938   }
1939 }
1940 
1941 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1942                              Register dst, Register src1, XMMRegister src2,
1943                              XMMRegister vtmp1, XMMRegister vtmp2) {
1944   switch (vlen) {
1945     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1946     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1947     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1948     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1949 
1950     default: assert(false, "wrong vector length");
1951   }
1952 }
1953 
1954 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1955                              Register dst, Register src1, XMMRegister src2,
1956                              XMMRegister vtmp1, XMMRegister vtmp2) {
1957   switch (vlen) {
1958     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1959     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1960     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1961     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1962 
1963     default: assert(false, "wrong vector length");
1964   }
1965 }
1966 
1967 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1968                              Register dst, Register src1, XMMRegister src2,
1969                              XMMRegister vtmp1, XMMRegister vtmp2) {
1970   switch (vlen) {
1971     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1972     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1973     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1974     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1975 
1976     default: assert(false, "wrong vector length");
1977   }
1978 }
1979 
1980 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1981                              Register dst, Register src1, XMMRegister src2,
1982                              XMMRegister vtmp1, XMMRegister vtmp2) {
1983   switch (vlen) {
1984     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1985     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1986     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1987     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1988 
1989     default: assert(false, "wrong vector length");
1990   }
1991 }
1992 
1993 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1994                              Register dst, Register src1, XMMRegister src2,
1995                              XMMRegister vtmp1, XMMRegister vtmp2) {
1996   switch (vlen) {
1997     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1998     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1999     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2000 
2001     default: assert(false, "wrong vector length");
2002   }
2003 }
2004 
2005 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2006   switch (vlen) {
2007     case 2:
2008       assert(vtmp2 == xnoreg, "");
2009       reduce2F(opcode, dst, src, vtmp1);
2010       break;
2011     case 4:
2012       assert(vtmp2 == xnoreg, "");
2013       reduce4F(opcode, dst, src, vtmp1);
2014       break;
2015     case 8:
2016       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2017       break;
2018     case 16:
2019       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2020       break;
2021     default: assert(false, "wrong vector length");
2022   }
2023 }
2024 
2025 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2026   switch (vlen) {
2027     case 2:
2028       assert(vtmp2 == xnoreg, "");
2029       reduce2D(opcode, dst, src, vtmp1);
2030       break;
2031     case 4:
2032       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2033       break;
2034     case 8:
2035       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2036       break;
2037     default: assert(false, "wrong vector length");
2038   }
2039 }
2040 
2041 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2042   switch (vlen) {
2043     case 2:
2044       assert(vtmp1 == xnoreg, "");
2045       assert(vtmp2 == xnoreg, "");
2046       unorderedReduce2F(opcode, dst, src);
2047       break;
2048     case 4:
2049       assert(vtmp2 == xnoreg, "");
2050       unorderedReduce4F(opcode, dst, src, vtmp1);
2051       break;
2052     case 8:
2053       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2054       break;
2055     case 16:
2056       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2057       break;
2058     default: assert(false, "wrong vector length");
2059   }
2060 }
2061 
2062 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2063   switch (vlen) {
2064     case 2:
2065       assert(vtmp1 == xnoreg, "");
2066       assert(vtmp2 == xnoreg, "");
2067       unorderedReduce2D(opcode, dst, src);
2068       break;
2069     case 4:
2070       assert(vtmp2 == xnoreg, "");
2071       unorderedReduce4D(opcode, dst, src, vtmp1);
2072       break;
2073     case 8:
2074       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2075       break;
2076     default: assert(false, "wrong vector length");
2077   }
2078 }
2079 
// Reduce two int lanes of src2 together with the scalar input src1 into dst.
void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, vtmp1); // horizontal add: lane0 + lane1 into lane0
  } else {
    pshufd(vtmp1, src2, 0x1); // bring lane 1 down to lane 0
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
  }
  movdl(vtmp2, src1); // fold in the scalar input
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  movdl(dst, vtmp1);
}
2094 
// Reduce four int lanes of src2 together with the scalar input src1 into dst:
// fold the upper pair onto the lower pair, then finish with reduce2I.
void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, src2); // pairwise adds leave two partial sums in the low lanes
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    pshufd(vtmp2, src2, 0xE); // bring lanes 2,3 down to lanes 0,1
    reduce_operation_128(T_INT, opcode, vtmp2, src2);
    reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
  }
}
2108 
// Reduce eight int lanes (256-bit) of src2 together with scalar src1 into dst:
// fold the high 128-bit half into the low one, then reduce the 128-bit result.
void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); // pairwise adds within each 128-bit half
    vextracti128_high(vtmp2, vtmp1);
    vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); // combine the two halves
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    vextracti128_high(vtmp1, src2);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
    reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  }
}
2121 
// Reduce sixteen int lanes (512-bit): fold the high 256-bit half into the
// low one, then finish with the 8-int reduction.
void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
  reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2127 
// Reduce eight byte lanes of src2 together with scalar src1 into dst, by
// repeatedly folding the upper half onto the lower half (4+4, 2+2, 1+1),
// then combining with src1 at int width. Result is sign-extended to 32 bits.
void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp2, src2, 0x1); // fold bytes 4..7 onto bytes 0..3
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  movdqu(vtmp1, vtmp2);
  psrldq(vtmp1, 2);         // fold bytes 2..3 onto bytes 0..1
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  movdqu(vtmp2, vtmp1);
  psrldq(vtmp2, 1);         // fold byte 1 onto byte 0
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  movdl(vtmp2, src1);
  if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
    pmovzxbd(vtmp1, vtmp1); // unsigned ops combine zero-extended values
  } else {
    pmovsxbd(vtmp1, vtmp1);
  }
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); // fold in the scalar input
  pextrb(dst, vtmp1, 0x0);
  movsbl(dst, dst); // sign-extend the byte result
}
2147 
// Reduce sixteen byte lanes: fold the high 8 bytes onto the low 8, then
// finish with the 8-byte reduction.
void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp1, src2, 0xE); // bring bytes 8..15 down to the low quadword
  reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
  reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2153 
// Reduce 32 byte lanes (256-bit): fold the high 128-bit half into the low
// one, then finish with the 16-byte reduction.
void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp2, src2);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2159 
// Reduce 64 byte lanes (512-bit): fold the high 256-bit half into the low
// one, then finish with the 32-byte reduction.
void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp1, src2);
  reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
  reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2165 
// Multiply-reduce eight bytes: widen to shorts (no packed byte multiply
// exists) and reduce as eight shorts.
void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pmovsxbw(vtmp2, src2);
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2170 
2171 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2172   if (UseAVX > 1) {
2173     int vector_len = Assembler::AVX_256bit;
2174     vpmovsxbw(vtmp1, src2, vector_len);
2175     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2176   } else {
2177     pmovsxbw(vtmp2, src2);
2178     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2179     pshufd(vtmp2, src2, 0x1);
2180     pmovsxbw(vtmp2, src2);
2181     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2182   }
2183 }
2184 
// Multiply-reduce 32 bytes. With AVX512BW all bytes are widened to 32 shorts
// in a 512-bit register; otherwise each 128-bit half is reduced separately
// via mulreduce16B, accumulating through dst.
void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
    int vector_len = Assembler::AVX_512bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    assert(UseAVX >= 2,"Should not reach here.");
    mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); // low 16 bytes
    vextracti128_high(vtmp2, src2);
    mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); // high 16 bytes, folded into dst
  }
}
2197 
// Multiply-reduce 64 bytes (512-bit): reduce each 256-bit half with
// mulreduce32B, accumulating through dst.
void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
  vextracti64x4_high(vtmp2, src2);
  mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
}
2203 
// Reduce four short lanes of src2 together with scalar src1 into dst.
// Result is sign-extended to 32 bits.
void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddw(vtmp1, vtmp1); // two horizontal-add passes sum all four shorts
    phaddw(vtmp1, vtmp1);
  } else {
    pshufd(vtmp2, src2, 0x1); // fold shorts 2,3 onto shorts 0,1
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
    movdqu(vtmp1, vtmp2);
    psrldq(vtmp1, 2);         // fold short 1 onto short 0
    reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
  }
  movdl(vtmp2, src1);
  if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
    pmovzxwd(vtmp1, vtmp1); // unsigned ops combine zero-extended values
  } else {
    pmovsxwd(vtmp1, vtmp1);
  }
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); // fold in the scalar input
  pextrw(dst, vtmp1, 0x0);
  movswl(dst, dst); // sign-extend the short result
}
2228 
// Reduce eight short lanes: fold the high four shorts onto the low four,
// then finish with the 4-short reduction.
void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddw(vtmp1, src2); // pairwise adds leave four partial sums in the low lanes
  } else {
    pshufd(vtmp1, src2, 0xE); // bring shorts 4..7 down to the low quadword
    reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
  }
  reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2241 
// Reduce sixteen short lanes (256-bit): fold the high 128-bit half into the
// low one, then finish with the 8-short reduction.
void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    int vector_len = Assembler::AVX_256bit;
    vphaddw(vtmp2, src2, src2, vector_len);    // pairwise adds within each 128-bit half
    vpermq(vtmp2, vtmp2, 0xD8, vector_len);    // interleave the halves' partial sums
  } else {
    vextracti128_high(vtmp2, src2);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
  }
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2253 
2254 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2255   int vector_len = Assembler::AVX_256bit;
2256   vextracti64x4_high(vtmp1, src2);
2257   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2258   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2259 }
2260 
// Reduce two long lanes of src2 together with the scalar input src1 into dst.
void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp2, src2, 0xE); // bring lane 1 down to lane 0
  reduce_operation_128(T_LONG, opcode, vtmp2, src2);
  movdq(vtmp1, src1); // fold in the scalar input
  reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
  movdq(dst, vtmp1);
}
2268 
// Reduce four long lanes (256-bit): fold the high 128-bit half into the low
// one, then finish with the 2-long reduction.
void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp1, src2);
  reduce_operation_128(T_LONG, opcode, vtmp1, src2);
  reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2274 
// Reduce eight long lanes (512-bit): fold the high 256-bit half into the low
// one, then finish with the 4-long reduction.
void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
  reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2280 
// Build an opmask whose low 'len' bits are set.
void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
  mov64(temp, -1L);
  bzhiq(temp, temp, len); // BMI2: zero-extend from bit index 'len' (clears the high bits)
  kmovql(dst, temp);
}
2286 
// Strictly-ordered float reduction over two lanes:
// dst[0] = (dst[0] op src[0]) op src[1]. (reduce_operation_128 uses the
// scalar ss form for FP add/mul, so only lane 0 is combined each step.)
void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_FLOAT, opcode, dst, src);
  pshufd(vtmp, src, 0x1); // bring lane 1 down to lane 0
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
2292 
// Strictly-ordered float reduction over four lanes: combine lanes 0,1 via
// reduce2F, then fold in lanes 2 and 3 in order.
void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce2F(opcode, dst, src, vtmp);
  pshufd(vtmp, src, 0x2); // lane 2
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
  pshufd(vtmp, src, 0x3); // lane 3
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
2300 
// Ordered reduction of 8 floats: low 128-bit lane first, then the high lane.
void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4F(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce4F(opcode, dst, vtmp2, vtmp1);
}
2306 
// Ordered reduction of 16 floats: low 256-bit half first, then the high half.
void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce8F(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
}
2312 
// Unordered reduction of 2 floats: a single combining step, so element order
// cannot matter and the ordered 128-bit helper suffices.
void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0x1);    // element 1 into lane 0 of dst
  reduce_operation_128(T_FLOAT, opcode, dst, src);
}
2317 
// Unordered reduction of 4 floats: fold the high pair onto the low pair,
// then combine the remaining two elements.
void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  pshufd(vtmp, src, 0xE);   // elements {2,3} into lanes {0,1}
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
  unorderedReduce2F(opcode, dst, vtmp);
}
2323 
// Unordered reduction of 8 floats: fold the high 128-bit lane onto the low
// lane, then reduce the remaining 4 elements.
void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf128_high(vtmp1, src);
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
  unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
}
2329 
// Unordered reduction of 16 floats: fold 512 -> 256 bits, then reduce the
// remaining 8 elements.
void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);
  unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
  unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
}
2335 
// Ordered reduction of 2 doubles into accumulator dst: dst op element 0,
// then op element 1.
void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
  pshufd(vtmp, src, 0xE);   // high 64 bits of src into the low half of vtmp
  reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
}
2341 
// Ordered reduction of 4 doubles: low pair first, then the high 128-bit lane.
void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce2D(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce2D(opcode, dst, vtmp2, vtmp1);
}
2347 
// Ordered reduction of 8 doubles: low 256-bit half first, then the high half.
void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4D(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
}
2353 
// Unordered reduction of 2 doubles: a single combining step, so the ordered
// 128-bit helper suffices.
void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0xE);    // high 64 bits of src into the low half of dst
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
}
2358 
// Unordered reduction of 4 doubles: fold the high 128-bit lane onto the low
// lane, then combine the remaining two elements.
void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  vextractf128_high(vtmp, src);
  unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
  unorderedReduce2D(opcode, dst, vtmp);
}
2364 
// Unordered reduction of 8 doubles: fold 512 -> 256 bits, then reduce the
// remaining 4 elements.
void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);
  unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
  unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
}
2370 
// Masked vector load (memory -> register): forwards to the MacroAssembler
// implementation, which dispatches on the element type.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2374 
// Masked vector store (register -> memory): forwards to the MacroAssembler
// implementation, which dispatches on the element type.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2378 
// Masked register-to-register vector move: forwards to the MacroAssembler
// implementation, which dispatches on the element type.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2382 
2383 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2384                                  int vec_enc) {
2385   switch(elem_bt) {
2386     case T_INT:
2387     case T_FLOAT:
2388       vmaskmovps(dst, src, mask, vec_enc);
2389       break;
2390     case T_LONG:
2391     case T_DOUBLE:
2392       vmaskmovpd(dst, src, mask, vec_enc);
2393       break;
2394     default:
2395       fatal("Unsupported type %s", type2name(elem_bt));
2396       break;
2397   }
2398 }
2399 
2400 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2401                                  int vec_enc) {
2402   switch(elem_bt) {
2403     case T_INT:
2404     case T_FLOAT:
2405       vmaskmovps(dst, src, mask, vec_enc);
2406       break;
2407     case T_LONG:
2408     case T_DOUBLE:
2409       vmaskmovpd(dst, src, mask, vec_enc);
2410       break;
2411     default:
2412       fatal("Unsupported type %s", type2name(elem_bt));
2413       break;
2414   }
2415 }
2416 
// Min/max reduction over 'vlen' floats (vlen in {2, 4, 8, 16}), halving the
// active width on each iteration. If is_dst_valid, dst holds a prior partial
// result that is folded in at the end; otherwise the final step writes
// directly into dst.
void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
                                          XMMRegister dst, XMMRegister src,
                                          XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                          XMMRegister xmm_0, XMMRegister xmm_1) {
  const int permconst[] = {1, 14};  // vpermilps controls for the last two (within-128-bit) steps
  XMMRegister wsrc = src;           // working source
  XMMRegister wdst = xmm_0;         // working destination
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;

  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 16) {
    vlen_enc = Assembler::AVX_256bit;
  }

  for (int i = log2(vlen) - 1; i >=0; i--) {
    if (i == 0 && !is_dst_valid) {
      wdst = dst;   // last step can write the result register directly
    }
    if (i == 3) {
      vextracti64x4_high(wtmp, wsrc);   // fold 512 -> 256 bits
    } else if (i == 2) {
      vextracti128_high(wtmp, wsrc);    // fold 256 -> 128 bits
    } else { // i = [0,1]
      vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
    }

    if (VM_Version::supports_avx10_2()) {
      // AVX10.2 variant: opmask form (k0 = no masking), needs no scratch registers.
      vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }
    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit;   // every step after the first operates on 128 bits
  }
  if (is_dst_valid) {
    // Fold the vector reduction into the existing partial result in dst.
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2459 
// Min/max reduction over 'vlen' doubles (vlen in {2, 4, 8}), halving the
// active width on each iteration. If is_dst_valid, dst holds a prior partial
// result that is folded in at the end; otherwise the final step writes
// directly into dst.
void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
                                        XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                        XMMRegister xmm_0, XMMRegister xmm_1) {
  XMMRegister wsrc = src;           // working source
  XMMRegister wdst = xmm_0;         // working destination
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 8) {
    vlen_enc = Assembler::AVX_256bit;
  }
  for (int i = log2(vlen) - 1; i >=0; i--) {
    if (i == 0 && !is_dst_valid) {
      wdst = dst;   // last step can write the result register directly
    }
    if (i == 1) {
      vextracti128_high(wtmp, wsrc);    // fold 256 -> 128 bits
    } else if (i == 2) {
      vextracti64x4_high(wtmp, wsrc);   // fold 512 -> 256 bits
    } else {
      assert(i == 0, "%d", i);
      vpermilpd(wtmp, wsrc, 1, vlen_enc);   // swap the remaining pair
    }

    if (VM_Version::supports_avx10_2()) {
      // AVX10.2 variant: opmask form (k0 = no masking), needs no scratch registers.
      vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }

    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit;   // every step after the first operates on 128 bits
  }

  if (is_dst_valid) {
    // Fold the vector reduction into the existing partial result in dst.
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2501 
2502 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2503   switch (bt) {
2504     case T_BYTE:  pextrb(dst, src, idx); break;
2505     case T_SHORT: pextrw(dst, src, idx); break;
2506     case T_INT:   pextrd(dst, src, idx); break;
2507     case T_LONG:  pextrq(dst, src, idx); break;
2508 
2509     default:
2510       assert(false,"Should not reach here.");
2511       break;
2512   }
2513 }
2514 
2515 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2516   int esize =  type2aelembytes(typ);
2517   int elem_per_lane = 16/esize;
2518   int lane = elemindex / elem_per_lane;
2519   int eindex = elemindex % elem_per_lane;
2520 
2521   if (lane >= 2) {
2522     assert(UseAVX > 2, "required");
2523     vextractf32x4(dst, src, lane & 3);
2524     return dst;
2525   } else if (lane > 0) {
2526     assert(UseAVX > 0, "required");
2527     vextractf128(dst, src, lane);
2528     return dst;
2529   } else {
2530     return src;
2531   }
2532 }
2533 
2534 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2535   if (typ == T_BYTE) {
2536     movsbl(dst, dst);
2537   } else if (typ == T_SHORT) {
2538     movswl(dst, dst);
2539   }
2540 }
2541 
// Extract integral element 'elemindex' from 'src' into GPR 'dst',
// sign-extending sub-int types to 32 bits. The index is reduced modulo the
// lane size: the caller is expected to have already moved the right 128-bit
// lane into 'src' (see get_lane()).
void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
  int esize =  type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;   // index within the 128-bit lane
  assert(is_integral_type(typ),"required");

  if (eindex == 0) {
    // Element 0 can be moved out directly, no pextr needed.
    if (typ == T_LONG) {
      movq(dst, src);
    } else {
      movdl(dst, src);
      movsxl(typ, dst);
    }
  } else {
    extract(typ, dst, src, eindex);
    movsxl(typ, dst);
  }
}
2560 
// Extract float/double element 'elemindex' from 'src' into the low part of
// 'dst', zeroing the bits above the element. The index is reduced modulo the
// lane size: the caller is expected to have already moved the right 128-bit
// lane into 'src' (see get_lane()). 'vtmp' is only required on the
// no-AVX float path.
void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
  int esize =  type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;   // index within the 128-bit lane
  assert((typ == T_FLOAT || typ == T_DOUBLE),"required");

  if (eindex == 0) {
    movq(dst, src);   // copy low 64 bits; movq clears bits 127:64 of dst
  } else {
    if (typ == T_FLOAT) {
      if (UseAVX == 0) {
        movdqu(dst, src);
        shufps(dst, dst, eindex);   // select element 'eindex' into lane 0 (other lanes masked below)
      } else {
        vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
      }
    } else {
      if (UseAVX == 0) {
        movdqu(dst, src);
        psrldq(dst, eindex*esize);  // shift the wanted double down into the low 64 bits
      } else {
        vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
      }
      movq(dst, dst);               // clear bits 127:64
    }
  }
  // Zero upper bits
  if (typ == T_FLOAT) {
    if (UseAVX == 0) {
      assert(vtmp != xnoreg, "required.");
      movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
      pand(dst, vtmp);
    } else {
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
    }
  }
}
2598 
2599 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2600   switch(typ) {
2601     case T_BYTE:
2602     case T_BOOLEAN:
2603       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2604       break;
2605     case T_SHORT:
2606     case T_CHAR:
2607       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2608       break;
2609     case T_INT:
2610     case T_FLOAT:
2611       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2612       break;
2613     case T_LONG:
2614     case T_DOUBLE:
2615       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2616       break;
2617     default:
2618       assert(false,"Should not reach here.");
2619       break;
2620   }
2621 }
2622 
2623 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2624   assert(rscratch != noreg || always_reachable(src2), "missing");
2625 
2626   switch(typ) {
2627     case T_BOOLEAN:
2628     case T_BYTE:
2629       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2630       break;
2631     case T_CHAR:
2632     case T_SHORT:
2633       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2634       break;
2635     case T_INT:
2636     case T_FLOAT:
2637       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2638       break;
2639     case T_LONG:
2640     case T_DOUBLE:
2641       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2642       break;
2643     default:
2644       assert(false,"Should not reach here.");
2645       break;
2646   }
2647 }
2648 
2649 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2650   switch(typ) {
2651     case T_BYTE:
2652       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2653       break;
2654     case T_SHORT:
2655       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2656       break;
2657     case T_INT:
2658     case T_FLOAT:
2659       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2660       break;
2661     case T_LONG:
2662     case T_DOUBLE:
2663       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2664       break;
2665     default:
2666       assert(false,"Should not reach here.");
2667       break;
2668   }
2669 }
2670 
// Emit a vector test of src1 against src2 (sets flags) for a vector of
// 'vlen_in_bytes' bytes. For sub-128-bit vectors the valid low part of src1
// is duplicated into vtmp so that the full-width 128-bit test only sees
// valid data; src2 does not need duplication (see comment below).
void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
  assert(vlen_in_bytes <= 32, "");
  int esize = type2aelembytes(bt);
  if (vlen_in_bytes == 32) {
    assert(vtmp == xnoreg, "required.");
    if (esize >= 4) {
      vtestps(src1, src2, AVX_256bit);
    } else {
      vptest(src1, src2, AVX_256bit);
    }
    return;
  }
  if (vlen_in_bytes < 16) {
    // Duplicate the lower part to fill the whole register,
    // Don't need to do so for src2
    assert(vtmp != xnoreg, "required");
    int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;   // replicate valid dwords across all lanes
    pshufd(vtmp, src1, shuffle_imm);
  } else {
    assert(vtmp == xnoreg, "required");
    vtmp = src1;   // full 128 bits are valid, test src1 directly
  }
  if (esize >= 4 && VM_Version::supports_avx()) {
    vtestps(vtmp, src2, AVX_128bit);
  } else {
    ptest(vtmp, src2);
  }
}
2699 
// Element-wise vector add of src1 and src2 into dst, dispatched on element
// type.
void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
#ifdef ASSERT
  // Without AVX512BW, byte/short vector ops must stay at <= 256 bits and can
  // only use the AVX2-visible registers xmm0-xmm15.
  bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
  bool is_bw_supported = VM_Version::supports_avx512bw();
  if (is_bw && !is_bw_supported) {
    assert(vlen_enc != Assembler::AVX_512bit, "required");
    assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
           "XMM register should be 0-15");
  }
#endif // ASSERT
  switch (elem_bt) {
    case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
    case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
    case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
    case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
    case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
    case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
    default: fatal("Unsupported type %s", type2name(elem_bt)); return;
  }
}
2720 
// Broadcast the scalar in GPR 'src' to every element of vector 'dst'.
// Uses the EVEX GPR-source broadcast forms when the CPU supports the needed
// AVX-512 extensions for this element type and vector length; otherwise
// falls back to an AVX2 sequence that first moves the scalar into an XMM
// register.
void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
  assert(UseAVX >= 2, "required");
  bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));  // needs AVX512BW
  bool is_vl = vlen_enc != Assembler::AVX_512bit;              // sub-512-bit needs AVX512VL
  if ((UseAVX > 2) &&
      (!is_bw || VM_Version::supports_avx512bw()) &&
      (!is_vl || VM_Version::supports_avx512vl())) {
    // EVEX-encoded broadcasts take the GPR source directly.
    switch (elem_bt) {
      case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
      case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
      case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
      case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  } else {
    // AVX2 fallback: move the scalar into an XMM register, then broadcast.
    assert(vlen_enc != Assembler::AVX_512bit, "required");
    assert((dst->encoding() < 16),"XMM register should be 0-15");
    switch (elem_bt) {
      case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
      case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
      case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
      case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
      case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
      case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  }
}
2749 
2750 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2751   switch (to_elem_bt) {
2752     case T_SHORT:
2753       vpmovsxbw(dst, src, vlen_enc);
2754       break;
2755     case T_INT:
2756       vpmovsxbd(dst, src, vlen_enc);
2757       break;
2758     case T_FLOAT:
2759       vpmovsxbd(dst, src, vlen_enc);
2760       vcvtdq2ps(dst, dst, vlen_enc);
2761       break;
2762     case T_LONG:
2763       vpmovsxbq(dst, src, vlen_enc);
2764       break;
2765     case T_DOUBLE: {
2766       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2767       vpmovsxbd(dst, src, mid_vlen_enc);
2768       vcvtdq2pd(dst, dst, vlen_enc);
2769       break;
2770     }
2771     default:
2772       fatal("Unsupported type %s", type2name(to_elem_bt));
2773       break;
2774   }
2775 }
2776 
2777 //-------------------------------------------------------------------------------------------
2778 
2779 // IndexOf for constant substrings with size >= 8 chars
2780 // which don't need to be loaded through stack.
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2,  Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  // Emits an indexOf loop for a constant-length substring of >= one full
  // pcmpestri stride (8 chars / 16 bytes). On exit 'result' holds the match
  // index, or -1 if not found.
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");

  // Load substring.
  if (ae == StrIntrinsicNode::UL) {
    pmovzxbw(vec, Address(str2, 0));  // UL: zero-extend latin1 substring bytes to chars
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring reminding elements and
    // cnt1 is number of string reminding elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less then substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less then substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only string if does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    negptr(cnt2);
    addptr(cnt2, stride);

    bind(SCAN_SUBSTR);
    subl(cnt1, stride);
    cmpl(cnt2, -stride); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, stride);
    movl(cnt2, stride); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      // Displacement fits in the address immediate: no extra register needed.
      int tail_off1 = int_cnt2<<scale1;
      int tail_off2 = int_cnt2<<scale2;
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
      } else {
        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
      }
      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
      } else {
        movdqu(vec, Address(str2, tmp, scale2, 0));
      }
      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
    }
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, stride);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(EXIT);

} // string_indexofC8
2957 
2958 // Small strings are loaded through stack if they cross page boundary.
2959 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2960                                        Register cnt1, Register cnt2,
2961                                        int int_cnt2,  Register result,
2962                                        XMMRegister vec, Register tmp,
2963                                        int ae) {
2964   ShortBranchVerifier sbv(this);
2965   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2966   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2967 
2968   //
2969   // int_cnt2 is length of small (< 8 chars) constant substring
2970   // or (-1) for non constant substring in which case its length
2971   // is in cnt2 register.
2972   //
2973   // Note, inline_string_indexOf() generates checks:
2974   // if (substr.count > string.count) return -1;
2975   // if (substr.count == 0) return 0;
2976   //
2977   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2978   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2979   // This method uses the pcmpestri instruction with bound registers
2980   //   inputs:
2981   //     xmm - substring
2982   //     rax - substring length (elements count)
2983   //     mem - scanned string
2984   //     rdx - string length (elements count)
2985   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2986   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2987   //   outputs:
2988   //     rcx - matched index in string
2989   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2990   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2991   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2992   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2993 
2994   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2995         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2996         FOUND_CANDIDATE;
2997 
2998   { //========================================================
2999     // We don't know where these strings are located
3000     // and we can't read beyond them. Load them through stack.
3001     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
3002 
3003     movptr(tmp, rsp); // save old SP
3004 
3005     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
3006       if (int_cnt2 == (1>>scale2)) { // One byte
3007         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
3008         load_unsigned_byte(result, Address(str2, 0));
3009         movdl(vec, result); // move 32 bits
3010       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
3011         // Not enough header space in 32-bit VM: 12+3 = 15.
3012         movl(result, Address(str2, -1));
3013         shrl(result, 8);
3014         movdl(vec, result); // move 32 bits
3015       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
3016         load_unsigned_short(result, Address(str2, 0));
3017         movdl(vec, result); // move 32 bits
3018       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
3019         movdl(vec, Address(str2, 0)); // move 32 bits
3020       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
3021         movq(vec, Address(str2, 0));  // move 64 bits
3022       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
3023         // Array header size is 12 bytes in 32-bit VM
3024         // + 6 bytes for 3 chars == 18 bytes,
3025         // enough space to load vec and shift.
3026         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3027         if (ae == StrIntrinsicNode::UL) {
3028           int tail_off = int_cnt2-8;
3029           pmovzxbw(vec, Address(str2, tail_off));
3030           psrldq(vec, -2*tail_off);
3031         }
3032         else {
3033           int tail_off = int_cnt2*(1<<scale2);
3034           movdqu(vec, Address(str2, tail_off-16));
3035           psrldq(vec, 16-tail_off);
3036         }
3037       }
3038     } else { // not constant substring
3039       cmpl(cnt2, stride);
3040       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3041 
3042       // We can read beyond string if srt+16 does not cross page boundary
3043       // since heaps are aligned and mapped by pages.
3044       assert(os::vm_page_size() < (int)G, "default page should be small");
3045       movl(result, str2); // We need only low 32 bits
3046       andl(result, ((int)os::vm_page_size()-1));
3047       cmpl(result, ((int)os::vm_page_size()-16));
3048       jccb(Assembler::belowEqual, CHECK_STR);
3049 
3050       // Move small strings to stack to allow load 16 bytes into vec.
3051       subptr(rsp, 16);
3052       int stk_offset = wordSize-(1<<scale2);
3053       push(cnt2);
3054 
3055       bind(COPY_SUBSTR);
3056       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3057         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3058         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3059       } else if (ae == StrIntrinsicNode::UU) {
3060         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3061         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3062       }
3063       decrement(cnt2);
3064       jccb(Assembler::notZero, COPY_SUBSTR);
3065 
3066       pop(cnt2);
3067       movptr(str2, rsp);  // New substring address
3068     } // non constant
3069 
3070     bind(CHECK_STR);
3071     cmpl(cnt1, stride);
3072     jccb(Assembler::aboveEqual, BIG_STRINGS);
3073 
3074     // Check cross page boundary.
3075     movl(result, str1); // We need only low 32 bits
3076     andl(result, ((int)os::vm_page_size()-1));
3077     cmpl(result, ((int)os::vm_page_size()-16));
3078     jccb(Assembler::belowEqual, BIG_STRINGS);
3079 
3080     subptr(rsp, 16);
3081     int stk_offset = -(1<<scale1);
3082     if (int_cnt2 < 0) { // not constant
3083       push(cnt2);
3084       stk_offset += wordSize;
3085     }
3086     movl(cnt2, cnt1);
3087 
3088     bind(COPY_STR);
3089     if (ae == StrIntrinsicNode::LL) {
3090       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3091       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3092     } else {
3093       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3094       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3095     }
3096     decrement(cnt2);
3097     jccb(Assembler::notZero, COPY_STR);
3098 
3099     if (int_cnt2 < 0) { // not constant
3100       pop(cnt2);
3101     }
3102     movptr(str1, rsp);  // New string address
3103 
3104     bind(BIG_STRINGS);
3105     // Load substring.
3106     if (int_cnt2 < 0) { // -1
3107       if (ae == StrIntrinsicNode::UL) {
3108         pmovzxbw(vec, Address(str2, 0));
3109       } else {
3110         movdqu(vec, Address(str2, 0));
3111       }
3112       push(cnt2);       // substr count
3113       push(str2);       // substr addr
3114       push(str1);       // string addr
3115     } else {
3116       // Small (< 8 chars) constant substrings are loaded already.
3117       movl(cnt2, int_cnt2);
3118     }
3119     push(tmp);  // original SP
3120 
3121   } // Finished loading
3122 
3123   //========================================================
3124   // Start search
3125   //
3126 
3127   movptr(result, str1); // string addr
3128 
3129   if (int_cnt2  < 0) {  // Only for non constant substring
3130     jmpb(SCAN_TO_SUBSTR);
3131 
3132     // SP saved at sp+0
3133     // String saved at sp+1*wordSize
3134     // Substr saved at sp+2*wordSize
3135     // Substr count saved at sp+3*wordSize
3136 
3137     // Reload substr for rescan, this code
3138     // is executed only for large substrings (> 8 chars)
3139     bind(RELOAD_SUBSTR);
3140     movptr(str2, Address(rsp, 2*wordSize));
3141     movl(cnt2, Address(rsp, 3*wordSize));
3142     if (ae == StrIntrinsicNode::UL) {
3143       pmovzxbw(vec, Address(str2, 0));
3144     } else {
3145       movdqu(vec, Address(str2, 0));
3146     }
3147     // We came here after the beginning of the substring was
3148     // matched but the rest of it was not so we need to search
3149     // again. Start from the next element after the previous match.
3150     subptr(str1, result); // Restore counter
3151     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3152       shrl(str1, 1);
3153     }
3154     addl(cnt1, str1);
3155     decrementl(cnt1);   // Shift to next element
3156     cmpl(cnt1, cnt2);
3157     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
3158 
3159     addptr(result, (1<<scale1));
3160   } // non constant
3161 
3162   // Scan string for start of substr in 16-byte vectors
3163   bind(SCAN_TO_SUBSTR);
3164   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3165   pcmpestri(vec, Address(result, 0), mode);
3166   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3167   subl(cnt1, stride);
3168   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3169   cmpl(cnt1, cnt2);
3170   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
3171   addptr(result, 16);
3172 
3173   bind(ADJUST_STR);
3174   cmpl(cnt1, stride); // Do not read beyond string
3175   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3176   // Back-up string to avoid reading beyond string.
3177   lea(result, Address(result, cnt1, scale1, -16));
3178   movl(cnt1, stride);
3179   jmpb(SCAN_TO_SUBSTR);
3180 
3181   // Found a potential substr
3182   bind(FOUND_CANDIDATE);
3183   // After pcmpestri tmp(rcx) contains matched element index
3184 
3185   // Make sure string is still long enough
3186   subl(cnt1, tmp);
3187   cmpl(cnt1, cnt2);
3188   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3189   // Left less then substring.
3190 
3191   bind(RET_NOT_FOUND);
3192   movl(result, -1);
3193   jmp(CLEANUP);
3194 
3195   bind(FOUND_SUBSTR);
3196   // Compute start addr of substr
3197   lea(result, Address(result, tmp, scale1));
3198   if (int_cnt2 > 0) { // Constant substring
3199     // Repeat search for small substring (< 8 chars)
3200     // from new point without reloading substring.
3201     // Have to check that we don't read beyond string.
3202     cmpl(tmp, stride-int_cnt2);
3203     jccb(Assembler::greater, ADJUST_STR);
3204     // Fall through if matched whole substring.
3205   } else { // non constant
3206     assert(int_cnt2 == -1, "should be != 0");
3207 
3208     addl(tmp, cnt2);
3209     // Found result if we matched whole substring.
3210     cmpl(tmp, stride);
3211     jcc(Assembler::lessEqual, RET_FOUND);
3212 
3213     // Repeat search for small substring (<= 8 chars)
3214     // from new point 'str1' without reloading substring.
3215     cmpl(cnt2, stride);
3216     // Have to check that we don't read beyond string.
3217     jccb(Assembler::lessEqual, ADJUST_STR);
3218 
3219     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3220     // Compare the rest of substring (> 8 chars).
3221     movptr(str1, result);
3222 
3223     cmpl(tmp, cnt2);
3224     // First 8 chars are already matched.
3225     jccb(Assembler::equal, CHECK_NEXT);
3226 
3227     bind(SCAN_SUBSTR);
3228     pcmpestri(vec, Address(str1, 0), mode);
3229     // Need to reload strings pointers if not matched whole vector
3230     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3231 
3232     bind(CHECK_NEXT);
3233     subl(cnt2, stride);
3234     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3235     addptr(str1, 16);
3236     if (ae == StrIntrinsicNode::UL) {
3237       addptr(str2, 8);
3238     } else {
3239       addptr(str2, 16);
3240     }
3241     subl(cnt1, stride);
3242     cmpl(cnt2, stride); // Do not read beyond substring
3243     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3244     // Back-up strings to avoid reading beyond substring.
3245 
3246     if (ae == StrIntrinsicNode::UL) {
3247       lea(str2, Address(str2, cnt2, scale2, -8));
3248       lea(str1, Address(str1, cnt2, scale1, -16));
3249     } else {
3250       lea(str2, Address(str2, cnt2, scale2, -16));
3251       lea(str1, Address(str1, cnt2, scale1, -16));
3252     }
3253     subl(cnt1, cnt2);
3254     movl(cnt2, stride);
3255     addl(cnt1, stride);
3256     bind(CONT_SCAN_SUBSTR);
3257     if (ae == StrIntrinsicNode::UL) {
3258       pmovzxbw(vec, Address(str2, 0));
3259     } else {
3260       movdqu(vec, Address(str2, 0));
3261     }
3262     jmp(SCAN_SUBSTR);
3263 
3264     bind(RET_FOUND_LONG);
3265     movptr(str1, Address(rsp, wordSize));
3266   } // non constant
3267 
3268   bind(RET_FOUND);
3269   // Compute substr offset
3270   subptr(result, str1);
3271   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3272     shrl(result, 1); // index
3273   }
3274   bind(CLEANUP);
3275   pop(rsp); // restore SP
3276 
3277 } // string_indexof
3278 
// Intrinsic for StringUTF16.indexOf(char): scan the char (UTF-16) sequence at
// 'str1' (length 'cnt1' in chars) for the 16-bit value in 'ch'.
// On exit 'result' holds the char index of the first match, or -1.
// Strategy: an AVX2 32-byte loop (16 chars/iteration) when available and the
// string is long enough, then an SSE 16-byte loop (8 chars/iteration), then a
// scalar tail. Clobbers cnt1, ch, tmp and vec1..vec3.
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 8; // chars per 16-byte vector

  Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
        SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1); // result doubles as the scan pointer until the end
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR);        // fewer than 8 chars: scalar only
    cmpl(cnt1, 2*stride);
    jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); // fewer than 16 chars: 8-char loop
    movdl(vec1, ch);
    vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); // vec1 = ch in all 16 word lanes
    vpxor(vec2, vec2);                               // vec2 = 0, mask operand for vptest
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
    andl(cnt1,0x0000000F);  //tail count (in chars)

    bind(SCAN_TO_16_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqw(vec3, vec3, vec1, 1);          // 0xFFFF in every word lane equal to ch
    vptest(vec2, vec3);                     // CF=1 iff vec3 is all zero (no match)
    jcc(Assembler::carryClear, FOUND_CHAR); // some lane matched
    addptr(result, 32);
    subl(tmp, 2*stride);
    jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
    jmp(SCAN_TO_8_CHAR); // vec1/vec2 already set up, skip the SSE init
    bind(SCAN_TO_8_CHAR_INIT);
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00); // replicate ch into the low 4 word lanes
    pshufd(vec1, vec1, 0);     // then into all 8 word lanes
    pxor(vec2, vec2);
  }
  bind(SCAN_TO_8_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR);
  if (UseAVX < 2) {
    // Same broadcast-and-zero setup as above for the pure SSE path.
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
  andl(cnt1,0x00000007);  //tail count (in chars)

  bind(SCAN_TO_8_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqw(vec3, vec1);  // 0xFFFF in every word lane equal to ch
  ptest(vec2, vec3);    // CF=1 iff vec3 is all zero (no match)
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
  bind(SCAN_TO_CHAR);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND); // no tail chars left
  bind(SCAN_TO_CHAR_LOOP);
  // Scalar tail: compare one char at a time.
  load_unsigned_short(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 2);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // A vector lane matched: extract the byte mask and find its lowest set bit,
  // which is the byte offset of the matching char within the vector.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1); // byte offset from the start of the string
  shrl(result, 1);      // bytes -> char index

  bind(DONE_LABEL);
} // string_indexof_char
3371 
// Intrinsic for StringLatin1.indexOf(char): scan the byte (Latin-1) sequence
// at 'str1' (length 'cnt1' in bytes) for the byte value in 'ch'.
// On exit 'result' holds the byte index of the first match, or -1.
// Same structure as string_indexof_char above, but with byte-granular
// compares (32/16 bytes per vector iteration) and no final byte-to-char
// conversion. Clobbers cnt1, ch, tmp and vec1..vec3.
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 16; // bytes per 16-byte vector

  Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
        SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1); // result doubles as the scan pointer until the end
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR_INIT);    // fewer than 16 bytes: scalar only
    cmpl(cnt1, stride*2);
    jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); // fewer than 32 bytes: 16-byte loop
    movdl(vec1, ch);
    vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); // vec1 = ch in all 32 byte lanes
    vpxor(vec2, vec2);                               // vec2 = 0, mask operand for vptest
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
    andl(cnt1,0x0000001F);  //tail count (in chars)

    bind(SCAN_TO_32_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); // 0xFF in every byte lane equal to ch
    vptest(vec2, vec3);                                // CF=1 iff vec3 is all zero (no match)
    jcc(Assembler::carryClear, FOUND_CHAR);            // some lane matched
    addptr(result, 32);
    subl(tmp, stride*2);
    jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
    jmp(SCAN_TO_16_CHAR); // vec1/vec2 already set up, skip the SSE init

    bind(SCAN_TO_16_CHAR_INIT);
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2); // shuffle with all-zero indices: broadcast byte 0 of vec1
  }

  bind(SCAN_TO_16_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
  if (UseAVX < 2) {
    // Same broadcast-and-zero setup as above for the pure SSE path.
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
  andl(cnt1,0x0000000F);  //tail count (in bytes)

  bind(SCAN_TO_16_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqb(vec3, vec1); // 0xFF in every byte lane equal to ch
  ptest(vec2, vec3);   // CF=1 iff vec3 is all zero (no match)
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...

  bind(SCAN_TO_CHAR_INIT);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND); // no tail bytes left
  bind(SCAN_TO_CHAR_LOOP);
  // Scalar tail: compare one byte at a time.
  load_unsigned_byte(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 1);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // A vector lane matched: the lowest set bit of the byte mask is the byte
  // offset of the match within the vector.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1); // byte index from the start of the string

  bind(DONE_LABEL);
} // stringL_indexof_char
3464 
3465 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3466   switch (eltype) {
3467   case T_BOOLEAN: return sizeof(jboolean);
3468   case T_BYTE:  return sizeof(jbyte);
3469   case T_SHORT: return sizeof(jshort);
3470   case T_CHAR:  return sizeof(jchar);
3471   case T_INT:   return sizeof(jint);
3472   default:
3473     ShouldNotReachHere();
3474     return -1;
3475   }
3476 }
3477 
3478 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3479   switch (eltype) {
3480   // T_BOOLEAN used as surrogate for unsigned byte
3481   case T_BOOLEAN: movzbl(dst, src);   break;
3482   case T_BYTE:    movsbl(dst, src);   break;
3483   case T_SHORT:   movswl(dst, src);   break;
3484   case T_CHAR:    movzwl(dst, src);   break;
3485   case T_INT:     movl(dst, src);     break;
3486   default:
3487     ShouldNotReachHere();
3488   }
3489 }
3490 
3491 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3492   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3493 }
3494 
3495 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3496   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3497 }
3498 
3499 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3500   const int vlen = Assembler::AVX_256bit;
3501   switch (eltype) {
3502   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3503   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3504   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3505   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3506   case T_INT:
3507     // do nothing
3508     break;
3509   default:
3510     ShouldNotReachHere();
3511   }
3512 }
3513 
// Intrinsic for Arrays.hashCode-style polynomial hashing:
//   result = result*31^cnt1 + ary1[0]*31^(cnt1-1) + ... + ary1[cnt1-1]
// over 'cnt1' elements of 'ary1', each widened to int per 'eltype'; 'result'
// carries the incoming hash in and the final hash out. For cnt1 >= 32 an
// AVX2 loop consumes 32 elements per iteration in four 8-lane i256
// accumulators; shorter arrays and the remainder use a 2x-unrolled scalar
// loop plus a single trailing-element fixup (see pseudo-code below).
void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
                                        Register index, Register tmp2, Register tmp3, XMMRegister vnext,
                                        XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
                                        XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
                                        XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
                                        BasicType eltype) {
  ShortBranchVerifier sbv(this);
  assert(UseAVX >= 2, "AVX2 intrinsics are required");
  assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
  assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);

  Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
        SHORT_UNROLLED_LOOP_EXIT,
        UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
        UNROLLED_VECTOR_LOOP_BEGIN,
        END;
  switch (eltype) {
  case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
  case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
  case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
  case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
  case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
  default:        BLOCK_COMMENT("arrays_hashcode {");                break;
  }

  // For "renaming" for readibility of the code
  const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
                    vresult[] = { vresult0, vresult1, vresult2, vresult3 },
                    vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };

  const int elsize = arrays_hashcode_elsize(eltype); // element width in bytes

  /*
    if (cnt1 >= 2) {
      if (cnt1 >= 32) {
        UNROLLED VECTOR LOOP
      }
      UNROLLED SCALAR LOOP
    }
    SINGLE SCALAR
   */

  cmpl(cnt1, 32);
  jcc(Assembler::less, SHORT_UNROLLED_BEGIN);

  // cnt1 >= 32 && generate_vectorized_loop
  xorl(index, index);

  // vresult = IntVector.zero(I256);
  for (int idx = 0; idx < 4; idx++) {
    vpxor(vresult[idx], vresult[idx]);
  }
  // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
  Register bound = tmp2;
  Register next = tmp3;
  lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
  movl(next, Address(tmp2, 0)); // per-32-element step multiplier (presumably 31^32 -- see stub table)
  movdl(vnext, next);
  vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);

  // index = 0;
  // bound = cnt1 & ~(32 - 1);
  movl(bound, cnt1);
  andl(bound, ~(32 - 1));
  // for (; index < bound; index += 32) {
  bind(UNROLLED_VECTOR_LOOP_BEGIN);
  // result *= next;
  imull(result, next);
  // loop fission to upfront the cost of fetching from memory, OOO execution
  // can then hopefully do a better job of prefetching
  for (int idx = 0; idx < 4; idx++) {
    arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
  }
  // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
  for (int idx = 0; idx < 4; idx++) {
    vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
    arrays_hashcode_elvcast(vtmp[idx], eltype); // widen loaded elements to int lanes
    vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
  }
  // index += 32;
  addl(index, 32);
  // index < bound;
  cmpl(index, bound);
  jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
  // }

  lea(ary1, Address(ary1, bound, Address::times(elsize))); // advance past the vectorized prefix
  subl(cnt1, bound);                                       // remaining element count
  // release bound

  // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
  for (int idx = 0; idx < 4; idx++) {
    lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
    arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
    vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
  }
  // result += vresult.reduceLanes(ADD);
  for (int idx = 0; idx < 4; idx++) {
    reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
  }

  // } else if (cnt1 < 32) {

  bind(SHORT_UNROLLED_BEGIN);
  // int i = 1;
  movl(index, 1);
  cmpl(index, cnt1);
  jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);

  // for (; i < cnt1 ; i += 2) {
  // Each iteration folds two elements: result = result*31^2 + a[i-1]*31 + a[i]
  bind(SHORT_UNROLLED_LOOP_BEGIN);
  movl(tmp3, 961); // 31^2
  imull(result, tmp3);
  arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  movl(tmp3, tmp2);
  shll(tmp3, 5);    // tmp3 = tmp2*32 ...
  subl(tmp3, tmp2); // ... - tmp2 = tmp2*31
  addl(result, tmp3);
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
  addl(result, tmp3);
  addl(index, 2);
  cmpl(index, cnt1);
  jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);

  // }
  // if (i >= cnt1) {
  bind(SHORT_UNROLLED_LOOP_EXIT);
  // Flags are still from the cmpl(index, cnt1) above: 'greater' means the
  // element count was even, so there is no trailing element to fold in.
  jccb(Assembler::greater, END);
  movl(tmp2, result);
  shll(result, 5);    // result = result*32 ...
  subl(result, tmp2); // ... - result = result*31
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  addl(result, tmp3);
  // }
  bind(END);

  BLOCK_COMMENT("} // arrays_hashcode");

} // arrays_hashcode
3653 
3654 // helper function for string_compare
3655 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3656                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3657                                            Address::ScaleFactor scale2, Register index, int ae) {
3658   if (ae == StrIntrinsicNode::LL) {
3659     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3660     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3661   } else if (ae == StrIntrinsicNode::UU) {
3662     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3663     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3664   } else {
3665     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3666     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3667   }
3668 }
3669 
3670 // Compare strings, used for char[] and byte[].
3671 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3672                                        Register cnt1, Register cnt2, Register result,
3673                                        XMMRegister vec1, int ae, KRegister mask) {
3674   ShortBranchVerifier sbv(this);
3675   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3676   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only AVX3
3677   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3678   int stride2x2 = 0x40;
3679   Address::ScaleFactor scale = Address::no_scale;
3680   Address::ScaleFactor scale1 = Address::no_scale;
3681   Address::ScaleFactor scale2 = Address::no_scale;
3682 
3683   if (ae != StrIntrinsicNode::LL) {
3684     stride2x2 = 0x20;
3685   }
3686 
3687   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3688     shrl(cnt2, 1);
3689   }
3690   // Compute the minimum of the string lengths and the
3691   // difference of the string lengths (stack).
3692   // Do the conditional move stuff
3693   movl(result, cnt1);
3694   subl(cnt1, cnt2);
3695   push(cnt1);
3696   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3697 
3698   // Is the minimum length zero?
3699   testl(cnt2, cnt2);
3700   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3701   if (ae == StrIntrinsicNode::LL) {
3702     // Load first bytes
3703     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3704     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3705   } else if (ae == StrIntrinsicNode::UU) {
3706     // Load first characters
3707     load_unsigned_short(result, Address(str1, 0));
3708     load_unsigned_short(cnt1, Address(str2, 0));
3709   } else {
3710     load_unsigned_byte(result, Address(str1, 0));
3711     load_unsigned_short(cnt1, Address(str2, 0));
3712   }
3713   subl(result, cnt1);
3714   jcc(Assembler::notZero,  POP_LABEL);
3715 
3716   if (ae == StrIntrinsicNode::UU) {
3717     // Divide length by 2 to get number of chars
3718     shrl(cnt2, 1);
3719   }
3720   cmpl(cnt2, 1);
3721   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3722 
3723   // Check if the strings start at the same location and setup scale and stride
3724   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3725     cmpptr(str1, str2);
3726     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3727     if (ae == StrIntrinsicNode::LL) {
3728       scale = Address::times_1;
3729       stride = 16;
3730     } else {
3731       scale = Address::times_2;
3732       stride = 8;
3733     }
3734   } else {
3735     scale1 = Address::times_1;
3736     scale2 = Address::times_2;
3737     // scale not used
3738     stride = 8;
3739   }
3740 
3741   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3742     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3743     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3744     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3745     Label COMPARE_TAIL_LONG;
3746     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only AVX3
3747 
3748     int pcmpmask = 0x19;
3749     if (ae == StrIntrinsicNode::LL) {
3750       pcmpmask &= ~0x01;
3751     }
3752 
3753     // Setup to compare 16-chars (32-bytes) vectors,
3754     // start from first character again because it has aligned address.
3755     if (ae == StrIntrinsicNode::LL) {
3756       stride2 = 32;
3757     } else {
3758       stride2 = 16;
3759     }
3760     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3761       adr_stride = stride << scale;
3762     } else {
3763       adr_stride1 = 8;  //stride << scale1;
3764       adr_stride2 = 16; //stride << scale2;
3765     }
3766 
3767     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3768     // rax and rdx are used by pcmpestri as elements counters
3769     movl(result, cnt2);
3770     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3771     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3772 
3773     // fast path : compare first 2 8-char vectors.
3774     bind(COMPARE_16_CHARS);
3775     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3776       movdqu(vec1, Address(str1, 0));
3777     } else {
3778       pmovzxbw(vec1, Address(str1, 0));
3779     }
3780     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3781     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3782 
3783     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3784       movdqu(vec1, Address(str1, adr_stride));
3785       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3786     } else {
3787       pmovzxbw(vec1, Address(str1, adr_stride1));
3788       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3789     }
3790     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3791     addl(cnt1, stride);
3792 
3793     // Compare the characters at index in cnt1
3794     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3795     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3796     subl(result, cnt2);
3797     jmp(POP_LABEL);
3798 
3799     // Setup the registers to start vector comparison loop
3800     bind(COMPARE_WIDE_VECTORS);
3801     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3802       lea(str1, Address(str1, result, scale));
3803       lea(str2, Address(str2, result, scale));
3804     } else {
3805       lea(str1, Address(str1, result, scale1));
3806       lea(str2, Address(str2, result, scale2));
3807     }
3808     subl(result, stride2);
3809     subl(cnt2, stride2);
3810     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3811     negptr(result);
3812 
3813     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3814     bind(COMPARE_WIDE_VECTORS_LOOP);
3815 
3816     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3817       cmpl(cnt2, stride2x2);
3818       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3819       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3820       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3821 
3822       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3823       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3824         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3825         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3826       } else {
3827         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3828         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3829       }
3830       kortestql(mask, mask);
3831       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3832       addptr(result, stride2x2);  // update since we already compared at this addr
3833       subl(cnt2, stride2x2);      // and sub the size too
3834       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3835 
3836       vpxor(vec1, vec1);
3837       jmpb(COMPARE_WIDE_TAIL);
3838     }//if (VM_Version::supports_avx512vlbw())
3839 
3840     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3841     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3842       vmovdqu(vec1, Address(str1, result, scale));
3843       vpxor(vec1, Address(str2, result, scale));
3844     } else {
3845       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3846       vpxor(vec1, Address(str2, result, scale2));
3847     }
3848     vptest(vec1, vec1);
3849     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3850     addptr(result, stride2);
3851     subl(cnt2, stride2);
3852     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3853     // clean upper bits of YMM registers
3854     vpxor(vec1, vec1);
3855 
3856     // compare wide vectors tail
3857     bind(COMPARE_WIDE_TAIL);
3858     testptr(result, result);
3859     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3860 
3861     movl(result, stride2);
3862     movl(cnt2, result);
3863     negptr(result);
3864     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3865 
3866     // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
3867     bind(VECTOR_NOT_EQUAL);
3868     // clean upper bits of YMM registers
3869     vpxor(vec1, vec1);
3870     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3871       lea(str1, Address(str1, result, scale));
3872       lea(str2, Address(str2, result, scale));
3873     } else {
3874       lea(str1, Address(str1, result, scale1));
3875       lea(str2, Address(str2, result, scale2));
3876     }
3877     jmp(COMPARE_16_CHARS);
3878 
3879     // Compare tail chars, length between 1 to 15 chars
3880     bind(COMPARE_TAIL_LONG);
3881     movl(cnt2, result);
3882     cmpl(cnt2, stride);
3883     jcc(Assembler::less, COMPARE_SMALL_STR);
3884 
3885     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3886       movdqu(vec1, Address(str1, 0));
3887     } else {
3888       pmovzxbw(vec1, Address(str1, 0));
3889     }
3890     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3891     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3892     subptr(cnt2, stride);
3893     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3894     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3895       lea(str1, Address(str1, result, scale));
3896       lea(str2, Address(str2, result, scale));
3897     } else {
3898       lea(str1, Address(str1, result, scale1));
3899       lea(str2, Address(str2, result, scale2));
3900     }
3901     negptr(cnt2);
3902     jmpb(WHILE_HEAD_LABEL);
3903 
3904     bind(COMPARE_SMALL_STR);
3905   } else if (UseSSE42Intrinsics) {
3906     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3907     int pcmpmask = 0x19;
3908     // Setup to compare 8-char (16-byte) vectors,
3909     // start from first character again because it has aligned address.
3910     movl(result, cnt2);
3911     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3912     if (ae == StrIntrinsicNode::LL) {
3913       pcmpmask &= ~0x01;
3914     }
3915     jcc(Assembler::zero, COMPARE_TAIL);
3916     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3917       lea(str1, Address(str1, result, scale));
3918       lea(str2, Address(str2, result, scale));
3919     } else {
3920       lea(str1, Address(str1, result, scale1));
3921       lea(str2, Address(str2, result, scale2));
3922     }
3923     negptr(result);
3924 
3925     // pcmpestri
3926     //   inputs:
3927     //     vec1- substring
3928     //     rax - negative string length (elements count)
3929     //     mem - scanned string
3930     //     rdx - string length (elements count)
3931     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3932     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3933     //   outputs:
3934     //     rcx - first mismatched element index
3935     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3936 
3937     bind(COMPARE_WIDE_VECTORS);
3938     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3939       movdqu(vec1, Address(str1, result, scale));
3940       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3941     } else {
3942       pmovzxbw(vec1, Address(str1, result, scale1));
3943       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3944     }
3945     // After pcmpestri cnt1(rcx) contains mismatched element index
3946 
3947     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3948     addptr(result, stride);
3949     subptr(cnt2, stride);
3950     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3951 
3952     // compare wide vectors tail
3953     testptr(result, result);
3954     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3955 
3956     movl(cnt2, stride);
3957     movl(result, stride);
3958     negptr(result);
3959     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3960       movdqu(vec1, Address(str1, result, scale));
3961       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3962     } else {
3963       pmovzxbw(vec1, Address(str1, result, scale1));
3964       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3965     }
3966     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3967 
3968     // Mismatched characters in the vectors
3969     bind(VECTOR_NOT_EQUAL);
3970     addptr(cnt1, result);
3971     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3972     subl(result, cnt2);
3973     jmpb(POP_LABEL);
3974 
3975     bind(COMPARE_TAIL); // limit is zero
3976     movl(cnt2, result);
3977     // Fallthru to tail compare
3978   }
3979   // Shift str2 and str1 to the end of the arrays, negate min
3980   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3981     lea(str1, Address(str1, cnt2, scale));
3982     lea(str2, Address(str2, cnt2, scale));
3983   } else {
3984     lea(str1, Address(str1, cnt2, scale1));
3985     lea(str2, Address(str2, cnt2, scale2));
3986   }
3987   decrementl(cnt2);  // first character was compared already
3988   negptr(cnt2);
3989 
3990   // Compare the rest of the elements
3991   bind(WHILE_HEAD_LABEL);
3992   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3993   subl(result, cnt1);
3994   jccb(Assembler::notZero, POP_LABEL);
3995   increment(cnt2);
3996   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3997 
3998   // Strings are equal up to min length.  Return the length difference.
3999   bind(LENGTH_DIFF_LABEL);
4000   pop(result);
4001   if (ae == StrIntrinsicNode::UU) {
4002     // Divide diff by 2 to get number of chars
4003     sarl(result, 1);
4004   }
4005   jmpb(DONE_LABEL);
4006 
4007   if (VM_Version::supports_avx512vlbw()) {
4008 
4009     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
4010 
4011     kmovql(cnt1, mask);
4012     notq(cnt1);
4013     bsfq(cnt2, cnt1);
4014     if (ae != StrIntrinsicNode::LL) {
4015       // Divide diff by 2 to get number of chars
4016       sarl(cnt2, 1);
4017     }
4018     addq(result, cnt2);
4019     if (ae == StrIntrinsicNode::LL) {
4020       load_unsigned_byte(cnt1, Address(str2, result));
4021       load_unsigned_byte(result, Address(str1, result));
4022     } else if (ae == StrIntrinsicNode::UU) {
4023       load_unsigned_short(cnt1, Address(str2, result, scale));
4024       load_unsigned_short(result, Address(str1, result, scale));
4025     } else {
4026       load_unsigned_short(cnt1, Address(str2, result, scale2));
4027       load_unsigned_byte(result, Address(str1, result, scale1));
4028     }
4029     subl(result, cnt1);
4030     jmpb(POP_LABEL);
4031   }//if (VM_Version::supports_avx512vlbw())
4032 
4033   // Discard the stored length difference
4034   bind(POP_LABEL);
4035   pop(cnt1);
4036 
4037   // That's it
4038   bind(DONE_LABEL);
4039   if(ae == StrIntrinsicNode::UL) {
4040     negl(result);
4041   }
4042 
4043 }
4044 
4045 // Search for Non-ASCII character (Negative byte value) in a byte array,
4046 // return the index of the first such character, otherwise the length
4047 // of the array segment searched.
4048 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4049 //   @IntrinsicCandidate
4050 //   public static int countPositives(byte[] ba, int off, int len) {
4051 //     for (int i = off; i < off + len; i++) {
4052 //       if (ba[i] < 0) {
4053 //         return i - off;
4054 //       }
4055 //     }
4056 //     return len;
4057 //   }
void C2_MacroAssembler::count_positives(Register ary1, Register len,
  Register result, Register tmp1,
  XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
  // Typical register assignment from the matcher:
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  // NOTE(review): label ADJUST is declared but never bound/jumped below -- looks unused.
  Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  movl(result, len); // copy: provisional result is "all positive" == len
  // len == 0
  testl(len, len);
  jcc(Assembler::zero, DONE);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
    VM_Version::supports_avx512vlbw() &&
    VM_Version::supports_bmi2()) {

    // AVX-512 path: scan 64 bytes per iteration using a masked signed compare
    // against zero (a byte is "negative" iff 0 > byte).
    Label test_64_loop, test_tail, BREAK_LOOP;
    movl(tmp1, len);
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);

    andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
    andl(len,  0xffffffc0); // vector count (in chars)
    jccb(Assembler::zero, test_tail);

    // Point ary1 past the vectorized region and bias len negative so the
    // loop counts up toward zero.
    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len);

    bind(test_64_loop);
    // Check whether our 64 elements of size byte contain negatives
    evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(mask1, mask1);
    jcc(Assembler::notZero, BREAK_LOOP);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done
    testl(tmp1, -1);
    jcc(Assembler::zero, DONE);


    // check the tail for absence of negatives
    // ~(~0 << len) applied up to two times (for 32-bit scenario)
    {
      // Build a k-mask with the low tmp1 bits set so the masked compare only
      // considers the remaining tail bytes.
      Register tmp3_aliased = len;
      mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
      shlxq(tmp3_aliased, tmp3_aliased, tmp1);
      notq(tmp3_aliased);
      kmovql(mask2, tmp3_aliased);
    }

    evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(mask1, mask2);
    jcc(Assembler::zero, DONE);

    // do a full check for negative registers in the tail
    movl(len, tmp1); // tmp1 holds low 6-bit from original len;
                     // ary1 already pointing to the right place
    jmpb(TAIL_START);

    bind(BREAK_LOOP);
    // At least one byte in the last 64 byte block was negative.
    // Set up to look at the last 64 bytes as if they were a tail
    lea(ary1, Address(ary1, len, Address::times_1));
    addptr(result, len);
    // Ignore the very last byte: if all others are positive,
    // it must be negative, so we can skip right to the 2+1 byte
    // end comparison at this point
    orl(result, 63);
    movl(len, 63);
    // Fallthru to tail compare
  } else {

    if (UseAVX >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 32-byte vectors
      testl(len, 0xffffffe0);   // vector count (in bytes)
      jccb(Assembler::zero, TAIL_START);

      andl(len, 0xffffffe0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      // Mask with the sign bit of every byte set; vptest against it is
      // non-zero iff some byte has its high (sign) bit set.
      movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 32);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000001f);   // any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000001f);
      vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
      vptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      // There are zeros, jump to the tail to determine exactly where
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 32-byte vector is negative.
      // Set up to look at the last 32 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 31);
      movl(len, 31);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 16-byte vectors
      testl(len, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, TAIL_START);

      andl(len, 0xfffffff0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      // Sign-bit mask, same trick as the AVX2 path above.
      movl(tmp1, 0x80808080);
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 16);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000000f); // len is zero, any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000000f);   // tail count (in bytes)
      movdqu(vec1, Address(ary1, len, Address::times_1, -16));
      ptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      // Some byte was negative; fall back to the scalar tail to locate it
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 16-byte vector is negative.
      // Set up and look at the last 16 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 15);
      movl(len, 15);
      // Fallthru to tail compare
    }
  }

  // Scalar tail: locate the first negative byte among at most 63 remaining
  // bytes by scanning 4 bytes, then 2, then 1.
  bind(TAIL_START);
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  andl(tmp1, 0x80808080);   // any sign bit set in these 4 bytes?
  jccb(Assembler::notZero, TAIL_ADJUST);
  addptr(len, 4);
  jccb(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2-3 bytes), if any
  bind(COMPARE_CHAR);

  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080);   // sign bits of the two tail bytes
  jccb(Assembler::notZero, CHAR_ADJUST);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1);   // tail  byte
  jccb(Assembler::zero, DONE);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  testl(tmp1, 0x00000080);
  jccb(Assembler::zero, DONE);
  // Last byte is negative: exclude it from the count
  subptr(result, 1);
  jmpb(DONE);

  bind(TAIL_ADJUST);
  // there are negative bits in the last 4 byte block.
  // Adjust result and check the next three bytes
  addptr(result, len);
  orl(result, 3);
  lea(ary1, Address(ary1, len, Address::times_1));
  jmpb(COMPARE_CHAR);

  bind(CHAR_ADJUST);
  // We are looking at a char + optional byte tail, and found that one
  // of the bytes in the char is negative. Adjust the result, check the
  // first byte and readjust if needed.
  andl(result, 0xfffffffc);
  testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
  jccb(Assembler::notZero, DONE);
  addptr(result, 1);

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
4289 
// Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
// Sets 'result' to 1 when the two ranges are element-wise equal, 0 otherwise.
// When expand_ary2 is true (supported on AVX2 only), ary1 holds 2-byte
// elements and ary2 holds 1-byte elements that are zero-extended to shorts
// before comparison.
void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                      Register limit, Register result, Register chr,
                                      XMMRegister vec1, XMMRegister vec2, bool is_char,
                                      KRegister mask, bool expand_ary2) {
  // for expand_ary2, limit is the (smaller) size of the second array.
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;

  assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
         "Expansion only implemented for AVX2");

  int length_offset  = arrayOopDesc::length_offset_in_bytes();
  int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);

  // When expanding, ary1 is indexed in 2-byte strides and only 8 elements of
  // ary2 are consumed per 16 bytes of ary1 data.
  Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
  int scaleIncr = expand_ary2 ? 8 : 16;

  if (is_array_equ) {
    // Check the input args
    cmpoop(ary1, ary2);   // same oop (or both null) => trivially equal
    jcc(Assembler::equal, TRUE_LABEL);

    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }

  if (is_array_equ && is_char) {
    // arrays_equals when used for char[].
    shll(limit, 1);      // byte count != 0
  }
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;

    // Compare 32-byte vectors
    if (expand_ary2) {
      andl(result, 0x0000000f);  //   tail count (in bytes)
      andl(limit, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL);
    } else {
      andl(result, 0x0000001f);  //   tail count (in bytes)
      andl(limit, 0xffffffe0);   // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL_16);
    }

    // Point both arrays past the vectorized region; run limit up to zero.
    lea(ary1, Address(ary1, limit, scaleFactor));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;

      cmpl(limit, -64);
      jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop

      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare (CF==0: not all bytes equal)
      addptr(limit, 64);  // update since we already compared at this addr
      cmpl(limit, -64);
      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instruction and just continue via non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL);  // true
      // But since we stopped at the points ary{1,2}+limit which are
      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
      //
      addptr(result, -64);   // it is safe, bc we just came from this area
      evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare

      jmp(TRUE_LABEL);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    }//if (VM_Version::supports_avx512vlbw())

    bind(COMPARE_WIDE_VECTORS);
    vmovdqu(vec1, Address(ary1, limit, scaleFactor));
    if (expand_ary2) {
      // Zero-extend 16 bytes of ary2 to 16 shorts before comparing.
      vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
    } else {
      vmovdqu(vec2, Address(ary2, limit, Address::times_1));
    }
    vpxor(vec1, vec2);   // xor is zero iff the 32 bytes are identical

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, scaleIncr * 2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    // Tail: redo the last (possibly overlapping) 32 bytes ending at the arrays' ends.
    vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
    } else {
      vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
    }
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    jmp(TRUE_LABEL);

    bind(COMPARE_TAIL_16); // limit is zero
    movl(limit, result);

    // Compare 16-byte chunks
    andl(result, 0x0000000f);  //   tail count (in bytes)
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, scaleFactor));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS_16);
    movdqu(vec1, Address(ary1, limit, scaleFactor));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
    } else {
      movdqu(vec2, Address(ary2, limit, Address::times_1));
    }
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, scaleIncr);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  } else if (UseSSE42Intrinsics) {
    // With SSE4.2, use double quad vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 16-byte vectors
    andl(result, 0x0000000f);  //   tail count (in bytes)
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(ary1, limit, Address::times_1));
    movdqu(vec2, Address(ary2, limit, Address::times_1));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 16);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    // Tail: redo the last 16 bytes ending at the arrays' ends.
    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  }

  // Compare 4-byte vectors
  if (expand_ary2) {
    // Expansion has no 4-byte form; remaining elements are compared one by one.
    testl(result, result);
    jccb(Assembler::zero, TRUE_LABEL);
  } else {
    andl(limit, 0xfffffffc); // vector count (in bytes)
    jccb(Assembler::zero, COMPARE_CHAR);
  }

  lea(ary1, Address(ary1, limit, scaleFactor));
  lea(ary2, Address(ary2, limit, Address::times_1));
  negptr(limit);

  bind(COMPARE_VECTORS);
  if (expand_ary2) {
    // There are no "vector" operations for bytes to shorts
    movzbl(chr, Address(ary2, limit, Address::times_1));
    cmpw(Address(ary1, limit, Address::times_2), chr);
    jccb(Assembler::notEqual, FALSE_LABEL);
    addptr(limit, 1);
    jcc(Assembler::notZero, COMPARE_VECTORS);
    jmp(TRUE_LABEL);
  } else {
    movl(chr, Address(ary1, limit, Address::times_1));
    cmpl(chr, Address(ary2, limit, Address::times_1));
    jccb(Assembler::notEqual, FALSE_LABEL);
    addptr(limit, 4);
    jcc(Assembler::notZero, COMPARE_VECTORS);
  }

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(chr, Address(ary1, 0));
  load_unsigned_short(limit, Address(ary2, 0));
  cmpl(chr, limit);
  jccb(Assembler::notEqual, FALSE_LABEL);

  if (is_array_equ && is_char) {
    // char[] lengths are even in bytes, so there is never a 1-byte tail.
    bind(COMPARE_BYTE);
  } else {
    lea(ary1, Address(ary1, 2));
    lea(ary2, Address(ary2, 2));

    bind(COMPARE_BYTE);
    testl(result, 0x1);   // tail  byte
    jccb(Assembler::zero, TRUE_LABEL);
    load_unsigned_byte(chr, Address(ary1, 0));
    load_unsigned_byte(limit, Address(ary2, 0));
    cmpl(chr, limit);
    jccb(Assembler::notEqual, FALSE_LABEL);
  }
  bind(TRUE_LABEL);
  movl(result, 1);   // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
4562 
// Out-of-line slow path for convertF2I below. Reached when the truncating
// convert produced the hardware "integer indefinite" sentinel value, i.e.
// the input was NaN, out of range, or exactly the minimum value.
// Spills the source XMM value to a fresh stack slot, calls the matching
// fixup stub, and pops 8 bytes into dst -- the stub is presumably expected
// to leave the corrected result in that stack slot (TODO confirm against
// the fixup stub implementations).
static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
#define __ masm.
  Register dst = stub.data<0>();     // destination GPR of the conversion
  XMMRegister src = stub.data<1>();  // source float/double value
  address target = stub.data<2>();   // f2i/f2l/d2i/d2l fixup entry point
  __ bind(stub.entry());
  __ subptr(rsp, 8);                 // scratch slot for the operand
  __ movdbl(Address(rsp), src);      // 8-byte store; used for float sources too
  __ call(RuntimeAddress(target));
  // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
  __ pop(dst);
  __ jmp(stub.continuation());
#undef __
}
4577 
4578 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4579   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4580   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4581 
4582   address slowpath_target;
4583   if (dst_bt == T_INT) {
4584     if (src_bt == T_FLOAT) {
4585       cvttss2sil(dst, src);
4586       cmpl(dst, 0x80000000);
4587       slowpath_target = StubRoutines::x86::f2i_fixup();
4588     } else {
4589       cvttsd2sil(dst, src);
4590       cmpl(dst, 0x80000000);
4591       slowpath_target = StubRoutines::x86::d2i_fixup();
4592     }
4593   } else {
4594     if (src_bt == T_FLOAT) {
4595       cvttss2siq(dst, src);
4596       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4597       slowpath_target = StubRoutines::x86::f2l_fixup();
4598     } else {
4599       cvttsd2siq(dst, src);
4600       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4601       slowpath_target = StubRoutines::x86::d2l_fixup();
4602     }
4603   }
4604 
4605   // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
4606   int max_size = 23 + (UseAPX ? 1 : 0);
4607   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
4608   jcc(Assembler::equal, stub->entry());
4609   bind(stub->continuation());
4610 }
4611 
4612 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4613                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4614   switch(ideal_opc) {
4615     case Op_LShiftVS:
4616       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4617     case Op_LShiftVI:
4618       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4619     case Op_LShiftVL:
4620       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4621     case Op_RShiftVS:
4622       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4623     case Op_RShiftVI:
4624       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4625     case Op_RShiftVL:
4626       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4627     case Op_URShiftVS:
4628       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4629     case Op_URShiftVI:
4630       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4631     case Op_URShiftVL:
4632       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4633     case Op_RotateRightV:
4634       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4635     case Op_RotateLeftV:
4636       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4637     default:
4638       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4639       break;
4640   }
4641 }
4642 
4643 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4644                                                XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4645   if (is_unsigned) {
4646     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4647   } else {
4648     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4649   }
4650 }
4651 
4652 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4653                                                       XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4654   switch (elem_bt) {
4655     case T_BYTE:
4656       if (ideal_opc == Op_SaturatingAddV) {
4657         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4658       } else {
4659         assert(ideal_opc == Op_SaturatingSubV, "");
4660         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4661       }
4662       break;
4663     case T_SHORT:
4664       if (ideal_opc == Op_SaturatingAddV) {
4665         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4666       } else {
4667         assert(ideal_opc == Op_SaturatingSubV, "");
4668         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4669       }
4670       break;
4671     default:
4672       fatal("Unsupported type %s", type2name(elem_bt));
4673       break;
4674   }
4675 }
4676 
4677 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4678                                                         XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4679   switch (elem_bt) {
4680     case T_BYTE:
4681       if (ideal_opc == Op_SaturatingAddV) {
4682         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4683       } else {
4684         assert(ideal_opc == Op_SaturatingSubV, "");
4685         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4686       }
4687       break;
4688     case T_SHORT:
4689       if (ideal_opc == Op_SaturatingAddV) {
4690         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4691       } else {
4692         assert(ideal_opc == Op_SaturatingSubV, "");
4693         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4694       }
4695       break;
4696     default:
4697       fatal("Unsupported type %s", type2name(elem_bt));
4698       break;
4699   }
4700 }
4701 
4702 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4703                                                Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4704   if (is_unsigned) {
4705     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4706   } else {
4707     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4708   }
4709 }
4710 
4711 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4712                                                       XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4713   switch (elem_bt) {
4714     case T_BYTE:
4715       if (ideal_opc == Op_SaturatingAddV) {
4716         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4717       } else {
4718         assert(ideal_opc == Op_SaturatingSubV, "");
4719         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4720       }
4721       break;
4722     case T_SHORT:
4723       if (ideal_opc == Op_SaturatingAddV) {
4724         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4725       } else {
4726         assert(ideal_opc == Op_SaturatingSubV, "");
4727         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4728       }
4729       break;
4730     default:
4731       fatal("Unsupported type %s", type2name(elem_bt));
4732       break;
4733   }
4734 }
4735 
4736 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4737                                                         XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4738   switch (elem_bt) {
4739     case T_BYTE:
4740       if (ideal_opc == Op_SaturatingAddV) {
4741         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4742       } else {
4743         assert(ideal_opc == Op_SaturatingSubV, "");
4744         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4745       }
4746       break;
4747     case T_SHORT:
4748       if (ideal_opc == Op_SaturatingAddV) {
4749         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4750       } else {
4751         assert(ideal_opc == Op_SaturatingSubV, "");
4752         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4753       }
4754       break;
4755     default:
4756       fatal("Unsupported type %s", type2name(elem_bt));
4757       break;
4758   }
4759 }
4760 
// Emits the EVEX masked (predicated) vector instruction selected by the ideal
// opcode, for register-register operands. The opmask 'mask', the 'merge' flag
// and the vector-length encoding 'vlen_enc' are forwarded unchanged to the
// underlying emitters. 'eType' is consulted only by the type-polymorphic
// emitters (rearrange, rotate, min/max and the logic ops); 'is_varshift'
// tells the shift cases whether the shift counts are per-lane (variable) or
// uniform.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
                                    bool is_varshift) {
  switch (ideal_opc) {
    case Op_AddVB:
      evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVS:
      evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVI:
      evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVL:
      evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVF:
      evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVD:
      evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVB:
      evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVS:
      evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVI:
      evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVL:
      evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVF:
      evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVD:
      evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVS:
      evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVI:
      evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVL:
      evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVF:
      evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVD:
      evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVF:
      evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVD:
      evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SqrtVF:
      evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SqrtVD:
      evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Unary ops: only src2 carries the input operand.
    case Op_AbsVB:
      evpabsb(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVS:
      evpabsw(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVI:
      evpabsd(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVL:
      evpabsq(dst, mask, src2, merge, vlen_enc); break;
    case Op_FmaVF:
      evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVD:
      evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Rearrange: src2 supplies the shuffle indices, src1 the data.
    case Op_VectorRearrange:
      evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
    case Op_LShiftVS:
      evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_LShiftVI:
      evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_LShiftVL:
      evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVS:
      evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVI:
      evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVL:
      evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVS:
      evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVI:
      evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVL:
      evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RotateLeftV:
      evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_RotateRightV:
      evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MaxV:
      evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MinV:
      evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMinV:
      evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMaxV:
      evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_XorV:
      evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_OrV:
      evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AndV:
      evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4862 
// Memory-operand variant of evmasked_op: src2 is an address. Supports a
// smaller opcode set than the register-register form (no shifts, rotates,
// abs, sqrt or rearrange). 'mask', 'merge' and 'vlen_enc' are forwarded
// unchanged; 'eType' is consulted only by the type-polymorphic emitters.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, Address src2, bool merge, int vlen_enc) {
  switch (ideal_opc) {
    case Op_AddVB:
      evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVS:
      evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVI:
      evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVL:
      evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVF:
      evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVD:
      evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVB:
      evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVS:
      evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVI:
      evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVL:
      evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVF:
      evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVD:
      evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVS:
      evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVI:
      evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVL:
      evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVF:
      evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVD:
      evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVF:
      evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVD:
      evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVF:
      evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVD:
      evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MaxV:
      evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MinV:
      evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMaxV:
      evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMinV:
      evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_XorV:
      evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_OrV:
      evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AndV:
      evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4927 
4928 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4929                                   KRegister src1, KRegister src2) {
4930   BasicType etype = T_ILLEGAL;
4931   switch(mask_len) {
4932     case 2:
4933     case 4:
4934     case 8:  etype = T_BYTE; break;
4935     case 16: etype = T_SHORT; break;
4936     case 32: etype = T_INT; break;
4937     case 64: etype = T_LONG; break;
4938     default: fatal("Unsupported type"); break;
4939   }
4940   assert(etype != T_ILLEGAL, "");
4941   switch(ideal_opc) {
4942     case Op_AndVMask:
4943       kand(etype, dst, src1, src2); break;
4944     case Op_OrVMask:
4945       kor(etype, dst, src1, src2); break;
4946     case Op_XorVMask:
4947       kxor(etype, dst, src1, src2); break;
4948     default:
4949       fatal("Unsupported masked operation"); break;
4950   }
4951 }
4952 
4953 /*
4954  * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
4955  * If src is NaN, the result is 0.
4956  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4957  * the result is equal to the value of Integer.MIN_VALUE.
4958  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4959  * the result is equal to the value of Integer.MAX_VALUE.
4960  */
4961 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4962                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4963                                                                    Register rscratch, AddressLiteral float_sign_flip,
4964                                                                    int vec_enc) {
4965   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4966   Label done;
4967   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4968   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4969   vptest(xtmp2, xtmp2, vec_enc);
4970   jccb(Assembler::equal, done);
4971 
4972   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4973   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4974 
4975   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4976   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4977   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4978 
4979   // Recompute the mask for remaining special value.
4980   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4981   // Extract SRC values corresponding to TRUE mask lanes.
4982   vpand(xtmp4, xtmp2, src, vec_enc);
4983   // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
4984   // values are set.
4985   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4986 
4987   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4988   bind(done);
4989 }
4990 
// EVEX variant of the float->int special-value fixup (see the comment on the
// AVX variant above for the contract: NaN -> 0, negative saturation keeps
// Integer.MIN_VALUE, positive saturation becomes Integer.MAX_VALUE).
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                    XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                    Register rscratch, AddressLiteral float_sign_flip,
                                                                    int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // ktmp1 = mask of DST lanes holding the special-value indicator
  // (sign-flip pattern); fast path out if there are none.
  evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero destination lanes whose source lane is NaN (unordered with itself).
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes whose source is not-less-than zero (+ve overflow):
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  // xtmp2 = ~xtmp1 (ternlog 0x11), i.e. the sign-flip pattern inverted.
  vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5012 
// Special-value fixup for float->long casts (EVEX). DST holds long lanes
// (hence the quadword compares/moves) while SRC still holds the float input
// (hence the packed-single compares).
void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral double_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // ktmp1 = mask of DST lanes holding the special-value indicator; fast path
  // out if there are none.
  evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero destination lanes whose source lane is NaN (unordered with itself).
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes whose source is not-less-than zero (+ve overflow):
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  // xtmp2 = ~xtmp1 (ternlog 0x11), i.e. the sign-flip pattern inverted.
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5035 
// Special-value fixup for double->int casts (EVEX). DST holds int lanes
// (doubleword compares/moves) while SRC still holds the double input (hence
// the packed-double compares).
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral float_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // ktmp1 = mask of DST lanes holding the special-value indicator; fast path
  // out if there are none.
  evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero destination lanes whose source lane is NaN (unordered with itself).
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes whose source is not-less-than zero (+ve overflow):
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  // xtmp2 = ~xtmp1 (ternlog 0x11), i.e. the sign-flip pattern inverted.
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5057 
5058 /*
5059  * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
5060  * If src is NaN, the result is 0.
5061  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5062  * the result is equal to the value of Long.MIN_VALUE.
5063  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5064  * the result is equal to the value of Long.MAX_VALUE.
5065  */
5066 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5067                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5068                                                                       Register rscratch, AddressLiteral double_sign_flip,
5069                                                                       int vec_enc) {
5070   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5071 
5072   Label done;
5073   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5074   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
5075   kortestwl(ktmp1, ktmp1);
5076   jccb(Assembler::equal, done);
5077 
5078   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5079   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5080   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5081 
5082   kxorwl(ktmp1, ktmp1, ktmp2);
5083   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5084   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5085   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5086   bind(done);
5087 }
5088 
5089 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5090                                                              XMMRegister xtmp, int index, int vec_enc) {
5091    assert(vec_enc < Assembler::AVX_512bit, "");
5092    if (vec_enc == Assembler::AVX_256bit) {
5093      vextractf128_high(xtmp, src);
5094      vshufps(dst, src, xtmp, index, vec_enc);
5095    } else {
5096      vshufps(dst, src, zero, index, vec_enc);
5097    }
5098 }
5099 
// AVX (no opmask) variant of the double->int special-value fixup. The int
// result in DST occupies a 128-bit vector, while SRC is still the full-width
// double vector ('src_vec_enc'); the double-lane masks are therefore packed
// down to doublewords before blending.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                                    XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
                                                                    AddressLiteral float_sign_flip, int src_vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");

  Label done;
  // Compare the destination lanes with float_sign_flip
  // value to get mask for all special values.
  movdqu(xtmp1, float_sign_flip, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
  ptest(xtmp2, xtmp2);
  jccb(Assembler::equal, done);

  // Flip float_sign_flip to get max integer value.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
  pxor(xtmp1, xtmp4);

  // Set destination lanes corresponding to unordered source lanes as zero.
  vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
  vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);

  // Shuffle mask vector and pack lower doublewords from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);

  // Recompute the mask for remaining special value.
  pxor(xtmp2, xtmp3);
  // Extract mask corresponding to non-negative source lanes.
  vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);

  // Shuffle mask vector and pack lower doublewords from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  pand(xtmp3, xtmp2);

  // Replace destination lanes holding special value(0x80000000) with max int
  // if corresponding source lane holds a +ve value.
  vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
  bind(done);
}
5139 
5140 
// Narrows packed int lanes in DST down to short or byte lanes. 'zero' must
// hold all-zero lanes (used as the second pack operand); 'xtmp' is scratch
// for the cross-lane gather needed on 256-bit vectors.
void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
                                                   XMMRegister xtmp, Register rscratch, int vec_enc) {
  switch(to_elem_bt) {
    case T_SHORT:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
      // Mask off the upper 16 bits of each int, then pack dwords to words.
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        // vpackusdw packs within 128-bit lanes; gather the halves together.
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      break;
    case  T_BYTE:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
      // Mask off the upper 24 bits of each int, then pack twice: dword->word,
      // word->byte.
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        // vpackusdw packs within 128-bit lanes; gather the halves together.
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      vpackuswb(dst, dst, zero, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt));
  }
}
5164 
5165 /*
5166  * Algorithm for vector D2L and F2I conversions (AVX 10.2 unsupported):-
5167  * a) Perform vector D2L/F2I cast.
5168  * b) Choose fast path if none of the result vector lane contains 0x80000000 value.
5169  *    It signifies that source value could be any of the special floating point
5170  *    values(NaN,-Inf,Inf,Max,-Min).
5171  * c) Set destination to zero if source is NaN value.
5172  * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5173  */
5174 
5175 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5176                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5177                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5178   int to_elem_sz = type2aelembytes(to_elem_bt);
5179   assert(to_elem_sz <= 4, "");
5180   vcvttps2dq(dst, src, vec_enc);
5181   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5182   if (to_elem_sz < 4) {
5183     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5184     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5185   }
5186 }
5187 
5188 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5189                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5190                                             Register rscratch, int vec_enc) {
5191   int to_elem_sz = type2aelembytes(to_elem_bt);
5192   assert(to_elem_sz <= 4, "");
5193   vcvttps2dq(dst, src, vec_enc);
5194   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5195   switch(to_elem_bt) {
5196     case T_INT:
5197       break;
5198     case T_SHORT:
5199       evpmovdw(dst, dst, vec_enc);
5200       break;
5201     case T_BYTE:
5202       evpmovdb(dst, dst, vec_enc);
5203       break;
5204     default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt));
5205   }
5206 }
5207 
// Casts float lanes to long lanes (EVEX): truncating float->quadword convert
// followed by the special-value fixup pass for NaN/Inf/out-of-range inputs.
void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                            KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
                                            Register rscratch, int vec_enc) {
  evcvttps2qq(dst, src, vec_enc);
  vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
}
5214 
// Handling for downcasting from double to integer or sub-word types on AVX2.
// Truncating convert, special-value fixup, then an optional narrowing step
// on the (128-bit) int result.
void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
                                           AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz < 8, "");
  vcvttpd2dq(dst, src, vec_enc);
  vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
                                              float_sign_flip, vec_enc);
  if (to_elem_sz < 4) {
    // xtmp4 holds all zero lanes.
    vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
  }
}
5229 
// Casts double lanes to long/int/short/byte (EVEX). With AVX512DQ a direct
// double->quadword convert is used and the result narrowed from longs;
// without it the convert goes double->doubleword and narrows from ints.
// Both paths run the corresponding special-value fixup pass first.
void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
                                            XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
                                            KRegister ktmp2, AddressLiteral sign_flip,
                                            Register rscratch, int vec_enc) {
  if (VM_Version::supports_avx512dq()) {
    evcvttpd2qq(dst, src, vec_enc);
    vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    switch(to_elem_bt) {
      case T_LONG:
        break;
      case T_INT:
        evpmovsqd(dst, dst, vec_enc);
        break;
      case T_SHORT:
        evpmovsqd(dst, dst, vec_enc);
        evpmovdw(dst, dst, vec_enc);
        break;
      case T_BYTE:
        evpmovsqd(dst, dst, vec_enc);
        evpmovdb(dst, dst, vec_enc);
        break;
      default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt));
    }
  } else {
    // No AVX512DQ: T_LONG is not supported on this path.
    assert(type2aelembytes(to_elem_bt) <= 4, "");
    vcvttpd2dq(dst, src, vec_enc);
    vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    switch(to_elem_bt) {
      case T_INT:
        break;
      case T_SHORT:
        evpmovdw(dst, dst, vec_enc);
        break;
      case T_BYTE:
        evpmovdb(dst, dst, vec_enc);
        break;
      default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt));
    }
  }
}
5270 
5271 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5272   switch(to_elem_bt) {
5273     case T_LONG:
5274       evcvttps2qqs(dst, src, vec_enc);
5275       break;
5276     case T_INT:
5277       evcvttps2dqs(dst, src, vec_enc);
5278       break;
5279     case T_SHORT:
5280       evcvttps2dqs(dst, src, vec_enc);
5281       evpmovdw(dst, dst, vec_enc);
5282       break;
5283     case T_BYTE:
5284       evcvttps2dqs(dst, src, vec_enc);
5285       evpmovdb(dst, dst, vec_enc);
5286       break;
5287     default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt));
5288   }
5289 }
5290 
5291 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5292   switch(to_elem_bt) {
5293     case T_LONG:
5294       evcvttps2qqs(dst, src, vec_enc);
5295       break;
5296     case T_INT:
5297       evcvttps2dqs(dst, src, vec_enc);
5298       break;
5299     case T_SHORT:
5300       evcvttps2dqs(dst, src, vec_enc);
5301       evpmovdw(dst, dst, vec_enc);
5302       break;
5303     case T_BYTE:
5304       evcvttps2dqs(dst, src, vec_enc);
5305       evpmovdb(dst, dst, vec_enc);
5306       break;
5307     default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt));
5308   }
5309 }
5310 
5311 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5312   switch(to_elem_bt) {
5313     case T_LONG:
5314       evcvttpd2qqs(dst, src, vec_enc);
5315       break;
5316     case T_INT:
5317       evcvttpd2dqs(dst, src, vec_enc);
5318       break;
5319     case T_SHORT:
5320       evcvttpd2dqs(dst, src, vec_enc);
5321       evpmovdw(dst, dst, vec_enc);
5322       break;
5323     case T_BYTE:
5324       evcvttpd2dqs(dst, src, vec_enc);
5325       evpmovdb(dst, dst, vec_enc);
5326       break;
5327     default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (reg src): %s", type2name(to_elem_bt));
5328   }
5329 }
5330 
5331 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5332   switch(to_elem_bt) {
5333     case T_LONG:
5334       evcvttpd2qqs(dst, src, vec_enc);
5335       break;
5336     case T_INT:
5337       evcvttpd2dqs(dst, src, vec_enc);
5338       break;
5339     case T_SHORT:
5340       evcvttpd2dqs(dst, src, vec_enc);
5341       evpmovdw(dst, dst, vec_enc);
5342       break;
5343     case T_BYTE:
5344       evcvttpd2dqs(dst, src, vec_enc);
5345       evpmovdb(dst, dst, vec_enc);
5346       break;
5347     default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt));
5348   }
5349 }
5350 
5351 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5352                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5353                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5354   // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
5355   // and re-instantiate original MXCSR.RC mode after that.
5356   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5357 
5358   mov64(tmp, julong_cast(0.5L));
5359   evpbroadcastq(xtmp1, tmp, vec_enc);
5360   vaddpd(xtmp1, src , xtmp1, vec_enc);
5361   evcvtpd2qq(dst, xtmp1, vec_enc);
5362   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5363                                                 double_sign_flip, vec_enc);;
5364 
5365   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5366 }
5367 
// Vector Math.round for floats (EVEX): computes floor(val + 0.5) by adding
// 0.5 to every lane and converting under MXCSR round-toward -inf, then fixes
// up NaN/Inf/out-of-range lanes, and finally restores the standard MXCSR.
void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Broadcast 0.5f into every lane and add it to the source before converting.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  vcvtps2dq(dst, xtmp1, vec_enc);
  // Patch NaN/Inf/out-of-range lanes (see vector_cast_float_to_int_special_cases_evex).
  vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                              float_sign_flip, vec_enc);

  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5385 
void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
                                               AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                               Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Broadcast 0.5f into every float lane and add it to src before the
  // float -> int conversion.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  vcvtps2dq(dst, xtmp1, vec_enc);
  // AVX (non-EVEX) fix-up path for NaN / out-of-range lanes.
  vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);

  // Restore the standard MXCSR rounding mode.
  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5402 
5403 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5404                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5405   switch (from_elem_bt) {
5406     case T_BYTE:
5407       switch (to_elem_bt) {
5408         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5409         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5410         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5411         default: ShouldNotReachHere();
5412       }
5413       break;
5414     case T_SHORT:
5415       switch (to_elem_bt) {
5416         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5417         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5418         default: ShouldNotReachHere();
5419       }
5420       break;
5421     case T_INT:
5422       assert(to_elem_bt == T_LONG, "");
5423       vpmovzxdq(dst, src, vlen_enc);
5424       break;
5425     default:
5426       ShouldNotReachHere();
5427   }
5428 }
5429 
5430 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5431                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5432   switch (from_elem_bt) {
5433     case T_BYTE:
5434       switch (to_elem_bt) {
5435         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5436         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5437         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5438         default: ShouldNotReachHere();
5439       }
5440       break;
5441     case T_SHORT:
5442       switch (to_elem_bt) {
5443         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5444         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5445         default: ShouldNotReachHere();
5446       }
5447       break;
5448     case T_INT:
5449       assert(to_elem_bt == T_LONG, "");
5450       vpmovsxdq(dst, src, vlen_enc);
5451       break;
5452     default:
5453       ShouldNotReachHere();
5454   }
5455 }
5456 
void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
                                         BasicType dst_bt, BasicType src_bt, int vlen) {
  // Resize vector mask lanes (values 0 / -1) between element types.
  // Widening sign-extends so -1 lanes stay -1; narrowing uses signed
  // saturating packs, which map 0 -> 0 and -1 -> -1 per narrowed lane.
  int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
  assert(vlen_enc != AVX_512bit, "");

  int dst_bt_size = type2aelembytes(dst_bt);
  int src_bt_size = type2aelembytes(src_bt);
  if (dst_bt_size > src_bt_size) {
    // Widening path: pick the sign-extension matching the size ratio.
    switch (dst_bt_size / src_bt_size) {
      case 2: vpmovsxbw(dst, src, vlen_enc); break;
      case 4: vpmovsxbd(dst, src, vlen_enc); break;
      case 8: vpmovsxbq(dst, src, vlen_enc); break;
      default: ShouldNotReachHere();
    }
  } else {
    assert(dst_bt_size < src_bt_size, "");
    // Narrowing path. For 256-bit vectors the pack instructions operate per
    // 128-bit lane, so a vpermq with selector 0x08 is used to compact the
    // meaningful quadwords of both lanes into the low 128 bits.
    switch (src_bt_size / dst_bt_size) {
      case 2: {
        if (vlen_enc == AVX_128bit) {
          vpacksswb(dst, src, src, vlen_enc);
        } else {
          vpacksswb(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
        }
        break;
      }
      case 4: {
        if (vlen_enc == AVX_128bit) {
          vpackssdw(dst, src, src, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpackssdw(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      case 8: {
        if (vlen_enc == AVX_128bit) {
          // 8:1 narrowing: shuffle dwords together, then pack twice.
          vpshufd(dst, src, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpshufd(dst, src, 0x08, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, AVX_128bit);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      default: ShouldNotReachHere();
    }
  }
}
5511 
5512 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5513                                    bool merge, BasicType bt, int vlen_enc) {
5514   if (bt == T_INT) {
5515     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5516   } else {
5517     assert(bt == T_LONG, "");
5518     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5519   }
5520 }
5521 
5522 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5523                                    bool merge, BasicType bt, int vlen_enc) {
5524   if (bt == T_INT) {
5525     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5526   } else {
5527     assert(bt == T_LONG, "");
5528     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5529   }
5530 }
5531 
void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
                                               Register rtmp2, XMMRegister xtmp, int mask_len,
                                               int vec_enc) {
  // Expand a scalar mask held in src (one bit per lane) into a byte vector:
  // PDEP against 0x0101010101010101 deposits one mask bit into the LSB of
  // each byte, handling 8 mask bits (one 64-bit lane) per step.
  int index = 0;
  int vindex = 0;
  mov64(rtmp1, 0x0101010101010101L);
  pdepq(rtmp1, src, rtmp1);
  if (mask_len > 8) {
    // Keep a working copy of the mask bits for the remaining iterations.
    movq(rtmp2, src);
    vpxor(xtmp, xtmp, xtmp, vec_enc);
    movq(xtmp, rtmp1);
  }
  movq(dst, rtmp1);

  mask_len -= 8;
  while (mask_len > 0) {
    assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
    index++;
    if ((index % 2) == 0) {
      // Starting a fresh 128-bit chunk; clear the staging register.
      pxor(xtmp, xtmp);
    }
    // Consume the next 8 mask bits and expand them into the next 64-bit lane.
    mov64(rtmp1, 0x0101010101010101L);
    shrq(rtmp2, 8);
    pdepq(rtmp1, rtmp2, rtmp1);
    pinsrq(xtmp, rtmp1, index % 2);
    vindex = index / 2;
    if (vindex) {
      // Write entire 16 byte vector when both 64 bit
      // lanes are update to save redundant instructions.
      if (index % 2) {
        vinsertf128(dst, dst, xtmp, vindex);
      }
    } else {
      vmovdqu(dst, xtmp);
    }
    mask_len -= 8;
  }
}
5570 
void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
  // Reduce the scalar mask bits held in tmp to the requested query result in
  // dst (true count / index of last true / index of first true / raw bits).
  switch(opc) {
    case Op_VectorMaskTrueCount:
      // Number of set mask bits.
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      if (VM_Version::supports_lzcnt()) {
        // Index of highest set bit = 63 - lzcnt(mask); lzcnt(0) == 64 yields -1.
        lzcntq(tmp, tmp);
        movl(dst, 63);
        subl(dst, tmp);
      } else {
        // BSR path: default dst to -1; BSR sets ZF on zero input, in which
        // case the cmov is skipped and -1 is kept.
        movl(dst, -1);
        bsrq(tmp, tmp);
        cmov32(Assembler::notZero, dst, tmp);
      }
      break;
    case Op_VectorMaskFirstTrue:
      if (VM_Version::supports_bmi1()) {
        if (masklen < 32) {
          // Plant a sentinel bit at position masklen so tzcnt returns
          // masklen when no mask bit is set.
          orl(tmp, 1 << masklen);
          tzcntl(dst, tmp);
        } else if (masklen == 32) {
          // tzcntl of a zero input is 32, which already equals masklen.
          tzcntl(dst, tmp);
        } else {
          assert(masklen == 64, "");
          // tzcntq of a zero input is 64, which already equals masklen.
          tzcntq(dst, tmp);
        }
      } else {
        if (masklen < 32) {
          // Sentinel bit makes BSF well-defined for an all-zero mask.
          orl(tmp, 1 << masklen);
          bsfl(dst, tmp);
        } else {
          assert(masklen == 32 || masklen == 64, "");
          // Default to masklen; the BSF result is only taken when a bit
          // was actually found (ZF clear).
          movl(dst, masklen);
          if (masklen == 32)  {
            bsfl(tmp, tmp);
          } else {
            bsfq(tmp, tmp);
          }
          cmov32(Assembler::notZero, dst, tmp);
        }
      }
      break;
    case Op_VectorMaskToLong:
      // The bits are already in place; caller must have arranged dst == tmp.
      assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
      break;
    default: assert(false, "Unhandled mask operation");
  }
}
5620 
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
                                              int masklen, int masksize, int vec_enc) {
  // AVX-512 opmask variant: copy the mask bits into a GPR, clip if needed,
  // and delegate the actual reduction to vector_mask_operation_helper.
  assert(VM_Version::supports_popcnt(), "");

  if(VM_Version::supports_avx512bw()) {
    kmovql(tmp, mask);
  } else {
    // Without AVX512BW only 16-bit opmask moves are available.
    assert(masklen <= 16, "");
    kmovwl(tmp, mask);
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5640 
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              Register tmp, int masklen, BasicType bt, int vec_enc) {
  // AVX/AVX2 variant: materialize the vector mask as scalar bits in tmp via
  // the movmsk family, then delegate to vector_mask_operation_helper.
  assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
         (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
  assert(VM_Version::supports_popcnt(), "");

  bool need_clip = false;
  switch(bt) {
    case T_BOOLEAN:
      // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1.
      // Negate (0 - x) to turn 1 into -1 so the sign-bit extraction works.
      vpxor(xtmp, xtmp, xtmp, vec_enc);
      vpsubb(xtmp, xtmp, mask, vec_enc);
      vpmovmskb(tmp, xtmp, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_BYTE:
      vpmovmskb(tmp, mask, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_SHORT:
      // Narrow the word mask to bytes; for a 256-bit source, compact the
      // per-lane pack results into the low 128 bits first.
      vpacksswb(xtmp, mask, mask, vec_enc);
      if (masklen >= 16) {
        vpermpd(xtmp, xtmp, 8, vec_enc);
      }
      vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
      need_clip = masklen < 16;
      break;
    case T_INT:
    case T_FLOAT:
      vmovmskps(tmp, mask, vec_enc);
      need_clip = masklen < 4;
      break;
    case T_LONG:
    case T_DOUBLE:
      vmovmskpd(tmp, mask, vec_enc);
      need_clip = masklen < 2;
      break;
    default: assert(false, "Unhandled type, %s", type2name(bt));
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  if (need_clip && opc != Op_VectorMaskFirstTrue) {
    // need_clip implies masklen < 32
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5690 
void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
                                             Register rtmp2, int mask_len) {
  // Compress the low mask_len bits of src down to the least significant
  // positions of dst: PEXT of an all-ones value by the mask yields
  // popcount(mask) consecutive low set bits.
  kmov(rtmp1, src);
  // Clip to mask_len bits.
  andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
  mov64(rtmp2, -1L);
  pextq(rtmp2, rtmp2, rtmp1);
  kmov(dst, rtmp2);
}
5699 
void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
                                                    XMMRegister mask, Register rtmp, Register rscratch,
                                                    XMMRegister permv, XMMRegister xtmp, BasicType bt,
                                                    int vec_enc) {
  // AVX2 fallback for CompressV/ExpandV: convert the vector mask into a row
  // index of a precomputed permutation table and let vpermps do the lane
  // movement. Only 4/8-byte element types are handled here.
  assert(type2aelembytes(bt) >= 4, "");
  assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
  address compress_perm_table = nullptr;
  address expand_perm_table = nullptr;
  if (type2aelembytes(bt) == 8) {
    compress_perm_table = StubRoutines::x86::compress_perm_table64();
    expand_perm_table  = StubRoutines::x86::expand_perm_table64();
    vmovmskpd(rtmp, mask, vec_enc);
  } else {
    compress_perm_table = StubRoutines::x86::compress_perm_table32();
    expand_perm_table = StubRoutines::x86::expand_perm_table32();
    vmovmskps(rtmp, mask, vec_enc);
  }
  shlq(rtmp, 5); // for 32 byte permute row.
  if (opcode == Op_CompressV) {
    lea(rscratch, ExternalAddress(compress_perm_table));
  } else {
    lea(rscratch, ExternalAddress(expand_perm_table));
  }
  addptr(rtmp, rscratch);
  // Load the permutation row selected by the mask bits.
  vmovdqu(permv, Address(rtmp));
  vpermps(dst, permv, src, Assembler::AVX_256bit);
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with zero vector using permute mask, each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, this can potentially be used as a blending mask after
  // compressing/expanding the source vector lanes.
  vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
}
5733 
5734 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5735                                                bool merge, BasicType bt, int vec_enc) {
5736   if (opcode == Op_CompressV) {
5737     switch(bt) {
5738     case T_BYTE:
5739       evpcompressb(dst, mask, src, merge, vec_enc);
5740       break;
5741     case T_CHAR:
5742     case T_SHORT:
5743       evpcompressw(dst, mask, src, merge, vec_enc);
5744       break;
5745     case T_INT:
5746       evpcompressd(dst, mask, src, merge, vec_enc);
5747       break;
5748     case T_FLOAT:
5749       evcompressps(dst, mask, src, merge, vec_enc);
5750       break;
5751     case T_LONG:
5752       evpcompressq(dst, mask, src, merge, vec_enc);
5753       break;
5754     case T_DOUBLE:
5755       evcompresspd(dst, mask, src, merge, vec_enc);
5756       break;
5757     default:
5758       fatal("Unsupported type %s", type2name(bt));
5759       break;
5760     }
5761   } else {
5762     assert(opcode == Op_ExpandV, "");
5763     switch(bt) {
5764     case T_BYTE:
5765       evpexpandb(dst, mask, src, merge, vec_enc);
5766       break;
5767     case T_CHAR:
5768     case T_SHORT:
5769       evpexpandw(dst, mask, src, merge, vec_enc);
5770       break;
5771     case T_INT:
5772       evpexpandd(dst, mask, src, merge, vec_enc);
5773       break;
5774     case T_FLOAT:
5775       evexpandps(dst, mask, src, merge, vec_enc);
5776       break;
5777     case T_LONG:
5778       evpexpandq(dst, mask, src, merge, vec_enc);
5779       break;
5780     case T_DOUBLE:
5781       evexpandpd(dst, mask, src, merge, vec_enc);
5782       break;
5783     default:
5784       fatal("Unsupported type %s", type2name(bt));
5785       break;
5786     }
5787   }
5788 }
5789 
void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                           KRegister ktmp1, int vec_enc) {
  // Vectorized Math.signum: -1.0 for negative lanes, +1.0 for positive
  // lanes, and the lane's own value for NaN, -0.0 and 0.0.
  if (opcode == Op_SignumVD) {
    // dst = 0.0 - 1.0 = -1.0 in every lane.
    vsubpd(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
  } else {
    assert(opcode == Op_SignumVF, "");
    // dst = 0.0f - 1.0f = -1.0f in every lane.
    vsubps(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmps(dst, ktmp1, dst, src, true, vec_enc);
  }
}
5811 
void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                          XMMRegister xtmp1, int vec_enc) {
  // AVX (non-EVEX) variant of vectorized Math.signum: -1.0 for negative
  // lanes, +1.0 for positive lanes, src's own value for NaN, -0.0 and 0.0.
  if (opcode == Op_SignumVD) {
    // dst = 0.0 - 1.0 = -1.0 in every lane.
    vsubpd(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1 (blend selected by the sign bit of src)
    vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  } else {
    assert(opcode == Op_SignumVF, "");
    // dst = 0.0f - 1.0f = -1.0f in every lane.
    vsubps(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1 (blend selected by the sign bit of src)
    vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  }
}
5831 
void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
  // Build a mask_len-wide opmask from the scalar in src. The right shifts
  // keep only mask_len of the copied bits (assumes src holds all-ones or
  // all-zeros for the maskAll semantics -- NOTE(review): confirm against callers).
  if (VM_Version::supports_avx512bw()) {
    if (mask_len > 32) {
      // Masks wider than 32 bits need the 64-bit opmask move.
      kmovql(dst, src);
    } else {
      kmovdl(dst, src);
      if (mask_len != 32) {
        kshiftrdl(dst, dst, 32 - mask_len);
      }
    }
  } else {
    // Without AVX512BW only 16-bit opmasks are available.
    assert(mask_len <= 16, "");
    kmovwl(dst, src);
    if (mask_len != 16) {
      kshiftrwl(dst, dst, 16 - mask_len);
    }
  }
}
5850 
5851 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5852   int lane_size = type2aelembytes(bt);
5853   if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5854       (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5855     movptr(rtmp, imm32);
5856     switch(lane_size) {
5857       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5858       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5859       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5860       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5861       fatal("Unsupported lane size %d", lane_size);
5862       break;
5863     }
5864   } else {
5865     movptr(rtmp, imm32);
5866     movq(dst, rtmp);
5867     switch(lane_size) {
5868       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5869       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5870       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5871       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5872       fatal("Unsupported lane size %d", lane_size);
5873       break;
5874     }
5875   }
5876 }
5877 
5878 //
5879 // Following is lookup table based popcount computation algorithm:-
5880 //       Index   Bit set count
5881 //     [ 0000 ->   0,
5882 //       0001 ->   1,
5883 //       0010 ->   1,
5884 //       0011 ->   2,
5885 //       0100 ->   1,
5886 //       0101 ->   2,
5887 //       0110 ->   2,
5888 //       0111 ->   3,
5889 //       1000 ->   1,
5890 //       1001 ->   2,
5891 //       1010 ->   3,
5892 //       1011 ->   3,
5893 //       1100 ->   2,
5894 //       1101 ->   3,
5895 //       1111 ->   4 ]
5896 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5897 //     shuffle indices for lookup table access.
5898 //  b. Right shift each byte of vector lane by 4 positions.
5899 //  c. Count the number of 1s in 4 MSB bits each byte. These bits are used as
5900 //     shuffle indices for lookup table access.
5901 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5902 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5903 //     count of all the bytes of a quadword.
5904 //  f. Perform step e. for upper 128bit vector lane.
5905 //  g. Pack the bitset count of quadwords back to double word.
5906 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
5907 
void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Per-byte popcount via the nibble lookup table (steps a-d of the
  // algorithm described above).
  assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
  // Low-nibble mask.
  vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
  // dst = high nibble of each byte; xtmp1 = low nibble of each byte.
  vpsrlw(dst, src, 4, vec_enc);
  vpand(dst, dst, xtmp1, vec_enc);
  vpand(xtmp1, src, xtmp1, vec_enc);
  // Look up the per-nibble bit counts and add the two counts of each byte.
  vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
  vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
  vpshufb(dst, xtmp2, dst, vec_enc);
  vpaddb(dst, dst, xtmp1, vec_enc);
}
5920 
void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Compute per-byte counts first, then widen them to per-int sums.
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Following code is as per steps e,f,g and h of above algorithm.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  // Unpack dwords to qwords and use psadbw against zero to sum the byte
  // counts of each quadword.
  vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
  vpsadbw(dst, dst, xtmp2, vec_enc);
  vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
  vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
  // Pack the quadword sums back into doublewords.
  vpackuswb(dst, xtmp1, dst, vec_enc);
}
5932 
void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Compute per-byte counts first, then combine them to per-word sums.
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Add the popcount of upper and lower bytes of word.
  vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
  vpsrlw(dst, xtmp1, 8, vec_enc);
  vpand(xtmp1, xtmp1, xtmp2, vec_enc);
  vpaddw(dst, dst, xtmp1, vec_enc);
}
5942 
void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Compute per-byte counts, then psadbw against zero sums the eight byte
  // counts of each quadword into that quadword.
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpsadbw(dst, xtmp1, xtmp2, vec_enc);
}
5949 
5950 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5951                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5952   switch(bt) {
5953     case T_LONG:
5954       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5955       break;
5956     case T_INT:
5957       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5958       break;
5959     case T_CHAR:
5960     case T_SHORT:
5961       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5962       break;
5963     case T_BYTE:
5964     case T_BOOLEAN:
5965       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5966       break;
5967     default:
5968       fatal("Unsupported type %s", type2name(bt));
5969       break;
5970   }
5971 }
5972 
5973 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5974                                                       KRegister mask, bool merge, int vec_enc) {
5975   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5976   switch(bt) {
5977     case T_LONG:
5978       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5979       evpopcntq(dst, mask, src, merge, vec_enc);
5980       break;
5981     case T_INT:
5982       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5983       evpopcntd(dst, mask, src, merge, vec_enc);
5984       break;
5985     case T_CHAR:
5986     case T_SHORT:
5987       assert(VM_Version::supports_avx512_bitalg(), "");
5988       evpopcntw(dst, mask, src, merge, vec_enc);
5989       break;
5990     case T_BYTE:
5991     case T_BOOLEAN:
5992       assert(VM_Version::supports_avx512_bitalg(), "");
5993       evpopcntb(dst, mask, src, merge, vec_enc);
5994       break;
5995     default:
5996       fatal("Unsupported type %s", type2name(bt));
5997       break;
5998   }
5999 }
6000 
6001 // Bit reversal algorithm first reverses the bits of each byte followed by
6002 // a byte level reversal for multi-byte primitive types (short/int/long).
6003 // Algorithm performs a lookup table access to get reverse bit sequence
6004 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte
6005 // is obtained by swapping the reverse bit sequences of upper and lower
6006 // nibble of a byte.
void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Reverse the bit order of each element (see the algorithm comment above):
  // reverse bits within each byte via the nibble lookup table, then reverse
  // the byte order for multi-byte element types.
  if (VM_Version::supports_avx512vlbw()) {

    // Get the reverse bit sequence of lower nibble of each byte.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
    evpandq(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    evporq(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);

  } else if(vec_enc == Assembler::AVX_512bit) {
    // Shift based bit reversal.
    assert(bt == T_LONG || bt == T_INT, "");

    // Swap lower and upper nibble of each byte.
    vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);

    // Swap two least and most significant bits of each nibble.
    vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);

    // Swap adjacent pair of bits.
    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);

    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
  } else {
    // AVX/AVX2 path: same lookup-table scheme as the AVX512VLBW branch, but
    // with VEX-encoded logical instructions.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);

    // Get the reverse bit sequence of lower nibble of each byte.
    vpand(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    vpor(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);
  }
}
6064 
void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
                                                XMMRegister xtmp, Register rscratch) {
  // Reverse the bit order of each element using a single GF2P8AFFINEQB with a
  // bit-reversal matrix, then reverse byte order for multi-byte elements.
  assert(VM_Version::supports_gfni(), "");
  assert(rscratch != noreg || always_reachable(mask), "missing");

  // Galois field instruction based bit reversal based on following algorithm.
  // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
  vpbroadcastq(xtmp, mask, vec_enc, rscratch);
  vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
  vector_reverse_byte(bt, dst, xtmp, vec_enc);
}
6076 
void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
                                          XMMRegister xtmp1, Register rtmp, int vec_enc) {
  // Swap adjacent nbits-wide bit groups: the bits selected by bitmask are
  // shifted left by nbits, the remaining bits are shifted right by nbits,
  // and the two halves are OR-ed back together.
  vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
  evpandq(dst, xtmp1, src, vec_enc);
  vpsllq(dst, dst, nbits, vec_enc);
  vpandn(xtmp1, xtmp1, src, vec_enc);
  vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
  evporq(dst, dst, xtmp1, vec_enc);
}
6086 
void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Shift based bit reversal.
  // Reverse the byte order within each element using EVEX rotates plus a
  // final byte swap within each word; no shuffle table is needed.
  assert(VM_Version::supports_evex(), "");
  switch(bt) {
    case T_LONG:
      // Swap upper and lower double word of each quad word.
      evprorq(xtmp1, k0, src, 32, true, vec_enc);
      // Swap upper and lower word of each double word.
      evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
      // Swap upper and lower byte of each word.
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_INT:
      // Swap upper and lower word of each double word.
      evprord(xtmp1, k0, src, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      // Swap upper and lower byte of each word.
      vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
      break;
    case T_BYTE:
      // Single-byte elements: nothing to reverse, just copy.
      evmovdquq(dst, k0, src, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6116 
void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
  // Reverse the byte order within each element of the given type.
  if (bt == T_BYTE) {
    // Single-byte elements: nothing to reverse, just copy src to dst.
    if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
      evmovdquq(dst, k0, src, true, vec_enc);
    } else {
      vmovdqu(dst, src);
    }
    return;
  }
  // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
  // pre-computed shuffle indices.
  switch(bt) {
    case T_LONG:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
      break;
    case T_INT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
      break;
    case T_CHAR:
    case T_SHORT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
  // Shuffle src through the loaded permutation to produce the reversed bytes.
  vpshufb(dst, src, dst, vec_enc);
}
6145 
void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                        KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
  // Per-element leading-zero count. Int/long lanes use VPLZCNT directly;
  // short lanes are widened to dwords first; byte lanes use a nibble lookup
  // table.
  assert(is_integral_type(bt), "");
  assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
  assert(VM_Version::supports_avx512cd(), "");
  switch(bt) {
    case T_LONG:
      evplzcntq(dst, ktmp, src, merge, vec_enc);
      break;
    case T_INT:
      evplzcntd(dst, ktmp, src, merge, vec_enc);
      break;
    case T_SHORT:
      // Set xtmp1 to all-ones, then interleave each short with an all-ones
      // low word so the dword lzcnt equals the short's own lzcnt; finally
      // pack the dword results back to words.
      vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
      vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
      evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
      vpunpckhwd(dst, xtmp1, src, vec_enc);
      evplzcntd(dst, ktmp, dst, merge, vec_enc);
      vpackusdw(dst, xtmp2, dst, vec_enc);
      break;
    case T_BYTE:
      // T1 = Compute leading zero counts of 4 LSB bits of each byte by
      // accessing the lookup table.
      // T2 = Compute leading zero counts of 4 MSB bits of each byte by
      // accessing the lookup table.
      // Add T1 to T2 if 4 MSB bits of byte are all zeros.
      assert(VM_Version::supports_avx512bw(), "");
      evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
      vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
      vpand(xtmp2, dst, src, vec_enc);
      vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
      vpsrlw(xtmp3, src, 4, vec_enc);
      vpand(xtmp3, dst, xtmp3, vec_enc);
      vpshufb(dst, xtmp1, xtmp3, vec_enc);
      vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
      evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
      evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6190 
// Per-byte leading zero count for AVX/AVX2 targets using a 16-entry nibble
// lookup table (one VPSHUFB per nibble). On exit xtmp1 is all-zeros, which
// vector_count_leading_zeros_short_avx relies on.
void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
  vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
  // T1 = Compute leading zero counts of 4 LSB bits of each byte by
  // accessing the lookup table.
  vpand(dst, xtmp2, src, vec_enc);
  vpshufb(dst, xtmp1, dst, vec_enc);
  // T2 = Compute leading zero counts of 4 MSB bits of each byte by
  // accessing the lookup table.
  vpsrlw(xtmp3, src, 4, vec_enc);
  vpand(xtmp3, xtmp2, xtmp3, vec_enc);
  vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
  // Add T1 to T2 if 4 MSB bits of byte are all zeros.
  vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
  vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
  vpaddb(dst, dst, xtmp2, vec_enc);
  // Where the high nibble is zero keep T1+T2, otherwise take T2 alone.
  vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
}
6210 
// Per-word leading zero count for AVX/AVX2 targets, built on top of the
// per-byte implementation by combining the byte counts of each word.
void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add zero counts of lower byte and upper byte of a word if
  // upper byte holds a zero value.
  vpsrlw(xtmp3, src, 8, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
  vpsllw(xtmp2, dst, 8, vec_enc);
  vpaddw(xtmp2, xtmp2, dst, vec_enc);
  // Pick the combined count where the upper byte was zero, else the plain count.
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  // The word result lives in the upper byte of each lane; move it down.
  vpsrlw(dst, dst, 8, vec_enc);
}
6224 
// Per-dword leading zero count for AVX/AVX2 targets (no VPLZCNT available),
// derived from the biased exponent of an int-to-float conversion.
void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                           XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // By converting the integer to a float, we can obtain the number of leading zeros based on the exponent of the float.
  // As the float exponent contains a bias of 127 for nonzero values, the bias must be removed before interpreting the
  // exponent as the leading zero count.

  // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
  // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
  // contributes to the leading number of zeros.
  vpsrld(dst, src, 1, vec_enc);
  vpandn(dst, dst, src, vec_enc);

  vcvtdq2ps(dst, dst, vec_enc);

  // By comparing the register to itself, all the bits in the destination are set.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);

  // Move the biased exponent to the low end of the lane and mask with 0xFF to discard the sign bit.
  // (all-ones >> 24 yields the 0xFF mask in every lane.)
  vpsrld(xtmp2, xtmp1, 24, vec_enc);
  vpsrld(dst, dst, 23, vec_enc);
  vpand(dst, xtmp2, dst, vec_enc);

  // Subtract 127 from the exponent, which removes the bias from the exponent.
  // (all-ones >> 25 yields 127 in every lane.)
  vpsrld(xtmp2, xtmp1, 25, vec_enc);
  vpsubd(dst, dst, xtmp2, vec_enc);

  // all-ones >> 27 yields 31 in every lane.
  vpsrld(xtmp2, xtmp1, 27, vec_enc);

  // If the original value is 0 the exponent would not have bias, so the subtraction creates a negative number. If this
  // is found in any of the lanes, replace the lane with -1 from xtmp1.
  vblendvps(dst, dst, xtmp1, dst, vec_enc, true, xtmp3);

  // If the original value is negative, replace the lane with 31.
  vblendvps(dst, dst, xtmp2, src, vec_enc, true, xtmp3);

  // Subtract the exponent from 31, giving the final result. For 0, the result is 32 as the exponent was replaced with -1,
  // and for negative numbers the result is 0 as the exponent was replaced with 31.
  vpsubd(dst, xtmp2, dst, vec_enc);
}
6264 
// Per-qword leading zero count for AVX/AVX2 targets, composed from two
// per-dword counts of the high and low halves of each long.
void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  // Find the leading zeros of the top and bottom halves of the long individually.
  vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);

  // Move the top half result to the bottom half of xtmp1, setting the top half to 0.
  vpsrlq(xtmp1, dst, 32, vec_enc);
  // By moving the top half result to the right by 6 bits, if the top half was empty (i.e. 32 is returned) the result bit will
  // be in the most significant position of the bottom half.
  vpsrlq(xtmp2, dst, 6, vec_enc);

  // In the bottom half, add the top half and bottom half results.
  vpaddq(dst, xtmp1, dst, vec_enc);

  // For the bottom half, choose between the values using the most significant bit of xtmp2.
  // If the MSB is set, then bottom+top in dst is the resulting value. If the top half is less than 32 xtmp1 is chosen,
  // which contains only the top half result.
  // In the top half the MSB is always zero, so the value in xtmp1 is always chosen. This value is always 0, which clears
  // the lane as required.
  vblendvps(dst, xtmp1, dst, xtmp2, vec_enc, true, xtmp3);
}
6286 
6287 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6288                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6289                                                        Register rtmp, int vec_enc) {
6290   assert(is_integral_type(bt), "unexpected type");
6291   assert(vec_enc < Assembler::AVX_512bit, "");
6292   switch(bt) {
6293     case T_LONG:
6294       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6295       break;
6296     case T_INT:
6297       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6298       break;
6299     case T_SHORT:
6300       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6301       break;
6302     case T_BYTE:
6303       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6304       break;
6305     default:
6306       fatal("Unsupported type %s", type2name(bt));
6307       break;
6308   }
6309 }
6310 
6311 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6312   switch(bt) {
6313     case T_BYTE:
6314       vpsubb(dst, src1, src2, vec_enc);
6315       break;
6316     case T_SHORT:
6317       vpsubw(dst, src1, src2, vec_enc);
6318       break;
6319     case T_INT:
6320       vpsubd(dst, src1, src2, vec_enc);
6321       break;
6322     case T_LONG:
6323       vpsubq(dst, src1, src2, vec_enc);
6324       break;
6325     default:
6326       fatal("Unsupported type %s", type2name(bt));
6327       break;
6328   }
6329 }
6330 
// Trailing zero count computation is based on leading zero count operation as per
// following equation. All AVX3 targets support AVX512CD feature which offers
// direct vector instruction to compute leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp4 = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp4 = xtmp4 + src, i.e. src - 1
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp4 = xtmp4 & ~src (ternary logic imm 0x40 computes A & ~B)
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  // dst = element-width-in-bits - CLZ
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}
6349 
// Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp3 = 0
  vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
  // xtmp3 = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp3 = xtmp3 | src; x | -x keeps the lowest set bit and everything above it.
  vpor(xtmp3, xtmp3, src, vec_enc);
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  // dst = element-width-in-bits - POPC
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}
6365 
// Unsigned 32-bit division: rax (dividend) / divisor -> quotient in rax.
// rdx is clobbered (zeroed for DIV, scratch on the fastpath).
void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Divisor has MSB clear: plain hardware unsigned divide of rdx:rax.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    // ANDN computes rax = ~rdx & rax in one instruction.
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  // Quotient can only be 0 or 1 when the divisor has its MSB set.
  shrl(rax, 31);
  bind(done);
}
6389 
// Unsigned 32-bit modulus: rax (dividend) % divisor -> remainder in rdx.
// rax is clobbered.
void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Divisor has MSB clear: plain hardware unsigned divide, remainder in rdx.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movl(rdx, rax);  // keep the dividend in rdx
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  // Arithmetic shift broadcasts the quotient bit (0 or 1) into an all-zeros /
  // all-ones mask for the conditional divisor subtraction.
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}
6415 
// Combined unsigned 32-bit divide + modulus:
// quotient -> rax, remainder -> rdx; tmp is scratch.
void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Divisor has MSB clear: one hardware divide yields both results.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movl(rdx, rax);  // keep the dividend in rdx
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  // sar produces the quotient as an all-zeros / all-ones mask for the
  // conditional divisor subtraction below.
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}
6446 
6447 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6448                                  XMMRegister xtmp2, Register rtmp) {
6449   if(VM_Version::supports_gfni()) {
6450     // Galois field instruction based bit reversal based on following algorithm.
6451     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6452     mov64(rtmp, 0x8040201008040201L);
6453     movq(xtmp1, src);
6454     movq(xtmp2, rtmp);
6455     gf2p8affineqb(xtmp1, xtmp2, 0);
6456     movq(dst, xtmp1);
6457   } else {
6458     // Swap even and odd numbered bits.
6459     movl(rtmp, src);
6460     andl(rtmp, 0x55555555);
6461     shll(rtmp, 1);
6462     movl(dst, src);
6463     andl(dst, 0xAAAAAAAA);
6464     shrl(dst, 1);
6465     orl(dst, rtmp);
6466 
6467     // Swap LSB and MSB 2 bits of each nibble.
6468     movl(rtmp, dst);
6469     andl(rtmp, 0x33333333);
6470     shll(rtmp, 2);
6471     andl(dst, 0xCCCCCCCC);
6472     shrl(dst, 2);
6473     orl(dst, rtmp);
6474 
6475     // Swap LSB and MSB 4 bits of each byte.
6476     movl(rtmp, dst);
6477     andl(rtmp, 0x0F0F0F0F);
6478     shll(rtmp, 4);
6479     andl(dst, 0xF0F0F0F0);
6480     shrl(dst, 4);
6481     orl(dst, rtmp);
6482   }
6483   bswapl(dst);
6484 }
6485 
// Reverses the bit order of a 64-bit long (Long.reverse intrinsic).
// Bits within each byte are reversed first, then bswap reverses the bytes.
void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    // The affine matrix 0x8040201008040201 reverses the bits of each byte.
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);  // complement of the mask selects the other bit group
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  // Bits within each byte are now reversed; reverse the byte order too.
  bswapq(dst);
}
6530 
// Unsigned 64-bit division: rax (dividend) / divisor -> quotient in rax.
// rdx is clobbered (zeroed for DIV, scratch on the fastpath).
void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Divisor has MSB clear: plain hardware unsigned divide of rdx:rax.
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    // ANDN computes rax = ~rdx & rax in one instruction.
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  // Quotient can only be 0 or 1 when the divisor has its MSB set.
  shrq(rax, 63);
  bind(done);
}
6554 
// Unsigned 64-bit modulus: rax (dividend) % divisor -> remainder in rdx.
// rax is clobbered.
void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Divisor has MSB clear: plain hardware unsigned divide, remainder in rdx.
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movq(rdx, rax);  // keep the dividend in rdx
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  // Arithmetic shift broadcasts the quotient bit (0 or 1) into an all-zeros /
  // all-ones mask for the conditional divisor subtraction.
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}
6580 
// Combined unsigned 64-bit divide + modulus:
// quotient -> rax, remainder -> rdx; tmp is scratch.
void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Divisor has MSB clear: one hardware divide yields both results.
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);  // keep the dividend in rdx
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  // sar produces the quotient as an all-zeros / all-ones mask for the
  // conditional divisor subtraction below.
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
6610 
// Cross-lane byte rearrangement for vectors wider than 128 bits. VPSHUFB only
// shuffles within 128-bit lanes, so the source is broadcast one 128-bit lane
// at a time and masked shuffles accumulate the selected bytes into dst.
void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are inlane operations and indices are determined using
  // lower 4 bit of each shuffle lane, thus all shuffle indices are
  // normalized to index range 0-15. This makes sure that all the multiples
  // of an index value are placed at same relative position in 128 bit
  // lane i.e. elements corresponding to shuffle indices 16, 32 and 64
  // will be 16th element in their respective 128 bit lanes.
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
  // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
  // original shuffle indices and move the shuffled lanes corresponding to true
  // mask to destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
  // and broadcasting second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);  // xtmp2 = 32 in every byte lane
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
  // and broadcasting third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);  // xtmp1 = 48 in every byte lane
  evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);  // xtmp2 = 64 in every byte lane
  evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}
6656 
6657 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6658                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6659   if (vlen_enc == AVX_128bit) {
6660     vpermilps(dst, src, shuffle, vlen_enc);
6661   } else if (bt == T_INT) {
6662     vpermd(dst, shuffle, src, vlen_enc);
6663   } else {
6664     assert(bt == T_FLOAT, "");
6665     vpermps(dst, shuffle, src, vlen_enc);
6666   }
6667 }
6668 
6669 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6670   switch(opcode) {
6671     case Op_AddHF: vaddsh(dst, src1, src2); break;
6672     case Op_SubHF: vsubsh(dst, src1, src2); break;
6673     case Op_MulHF: vmulsh(dst, src1, src2); break;
6674     case Op_DivHF: vdivsh(dst, src1, src2); break;
6675     default: assert(false, "%s", NodeClassNames[opcode]); break;
6676   }
6677 }
6678 
6679 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6680   switch(elem_bt) {
6681     case T_BYTE:
6682       if (ideal_opc == Op_SaturatingAddV) {
6683         vpaddsb(dst, src1, src2, vlen_enc);
6684       } else {
6685         assert(ideal_opc == Op_SaturatingSubV, "");
6686         vpsubsb(dst, src1, src2, vlen_enc);
6687       }
6688       break;
6689     case T_SHORT:
6690       if (ideal_opc == Op_SaturatingAddV) {
6691         vpaddsw(dst, src1, src2, vlen_enc);
6692       } else {
6693         assert(ideal_opc == Op_SaturatingSubV, "");
6694         vpsubsw(dst, src1, src2, vlen_enc);
6695       }
6696       break;
6697     default:
6698       fatal("Unsupported type %s", type2name(elem_bt));
6699       break;
6700   }
6701 }
6702 
6703 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6704   switch(elem_bt) {
6705     case T_BYTE:
6706       if (ideal_opc == Op_SaturatingAddV) {
6707         vpaddusb(dst, src1, src2, vlen_enc);
6708       } else {
6709         assert(ideal_opc == Op_SaturatingSubV, "");
6710         vpsubusb(dst, src1, src2, vlen_enc);
6711       }
6712       break;
6713     case T_SHORT:
6714       if (ideal_opc == Op_SaturatingAddV) {
6715         vpaddusw(dst, src1, src2, vlen_enc);
6716       } else {
6717         assert(ideal_opc == Op_SaturatingSubV, "");
6718         vpsubusw(dst, src1, src2, vlen_enc);
6719       }
6720       break;
6721     default:
6722       fatal("Unsupported type %s", type2name(elem_bt));
6723       break;
6724   }
6725 }
6726 
// Unsigned saturating subtraction of int/long lanes on EVEX targets:
// lanes that would underflow are clamped to zero.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                              XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when magnitude of second input is greater than first input.
  // overflow_mask = Inp1 <u Inp2
  evpcmpu(elem_bt, ktmp,  src2, src1, Assembler::lt, vlen_enc);
  // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
  // Masked subtract with merge=false zeroes the overflowing lanes.
  evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
}
6735 
// Unsigned saturating subtraction of int/long lanes on AVX targets (no
// opmask registers): underflowing lanes are clamped to zero via blend.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Emulate unsigned comparison using signed comparison
  // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
  vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
  vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);

  // xtmp2 = biased(src2) >s biased(src1), i.e. src1 <u src2 per lane.
  vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);

  // Res = INP1 - INP2 (non-commutative and non-associative)
  vpsub(elem_bt, dst, src1, src2, vlen_enc);
  // Res = Mask ? Zero : Res
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
}
6752 
// Unsigned saturating addition of int/long lanes on EVEX targets:
// overflowing lanes are clamped to the unsigned maximum (all-ones).
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                               XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned values ranges comprise of only +ve numbers, thus there exist only an upper bound saturation.
  // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // T1 = SRC1 | SRC2
  vpor(xtmp1, src1, src2, vlen_enc);
  // Max_Unsigned = -1
  vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
  // Unsigned compare:  Mask = Res <u T1
  evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
  // res  = Mask ? Max_Unsigned : Res
  evpblend(elem_bt, dst, ktmp,  dst, xtmp2, true, vlen_enc);
}
6768 
6769 //
6770 // Section 2-13 Hacker's Delight list following overflow detection check for saturating
6771 // unsigned addition operation.
6772 //    overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
6773 //
6774 // We empirically determined its semantic equivalence to following reduced expression
6775 //    overflow_mask =  (a + b) <u (a | b)
6776 //
6777 // and also verified it though Alive2 solver.
6778 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6779 //
6780 
6781 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6782                                                               XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6783   // Res = Signed Add INP1, INP2
6784   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6785   // Compute T1 = INP1 | INP2
6786   vpor(xtmp3, src1, src2, vlen_enc);
6787   // T1 = Minimum signed value.
6788   vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6789   // Convert T1 to signed value, T1 = T1 + MIN_VALUE
6790   vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
6791   // Convert Res to signed value, Res<s> = Res + MIN_VALUE
6792   vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
6793   // Compute overflow detection mask = Res<1> <s T1
6794   if (elem_bt == T_INT) {
6795     vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
6796   } else {
6797     assert(elem_bt == T_LONG, "");
6798     vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
6799   }
6800   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6801 }
6802 
// Sets ktmp from the sign bits of the qword lanes of src. Uses VPMOVQ2M when
// AVX512DQ is available, otherwise emulates it by comparing the arithmetic
// sign-spread of src against an all-ones vector.
void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovq2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      // xtmp2 = -1 (skipped when the caller already preloaded it).
      vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    // xtmp1 lane = 0 or -1 depending on src's sign bit.
    evpsraq(xtmp1, src, 63, vlen_enc);
    evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}
6816 
// Sets ktmp from the sign bits of the dword lanes of src. Uses VPMOVD2M when
// AVX512DQ is available, otherwise emulates it by comparing the arithmetic
// sign-spread of src against an all-ones vector.
void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovd2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      // xtmp2 = -1 (skipped when the caller already preloaded it).
      vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    // xtmp1 lane = 0 or -1 depending on src's sign bit.
    vpsrad(xtmp1, src, 31, vlen_enc);
    Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}
6830 
6831 
6832 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6833   if (elem_bt == T_LONG) {
6834     if (VM_Version::supports_evex()) {
6835       evpsraq(dst, src, 63, vlen_enc);
6836     } else {
6837       vpsrad(dst, src, 31, vlen_enc);
6838       vpshufd(dst, dst, 0xF5, vlen_enc);
6839     }
6840   } else {
6841     assert(elem_bt == T_INT, "");
6842     vpsrad(dst, src, 31, vlen_enc);
6843   }
6844 }
6845 
6846 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6847   if (compute_allones) {
6848     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6849       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6850     } else {
6851       vpcmpeqq(allones, allones, allones, vlen_enc);
6852     }
6853   }
6854   if (elem_bt == T_LONG) {
6855     vpsrlq(dst, allones, 1, vlen_enc);
6856   } else {
6857     assert(elem_bt == T_INT, "");
6858     vpsrld(dst, allones, 1, vlen_enc);
6859   }
6860 }
6861 
// Materializes the minimum signed value (0x8000...) in every int/long lane of
// dst by shifting an all-ones vector left to leave only the sign bit. When
// compute_allones is set, the all-ones vector is generated in 'allones' first
// (and is left there for the caller to reuse).
void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsllq(dst, allones, 63, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpslld(dst, allones, 31, vlen_enc);
  }
}
6877 
6878 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
6879                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6880   switch(elem_bt) {
6881     case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6882     case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6883     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6884     case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6885     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6886   }
6887 }
6888 
6889 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6890   switch(elem_bt) {
6891     case  T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
6892     case  T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
6893     case  T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6894     case  T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
6895     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6896   }
6897 }
6898 
6899 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6900                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6901   if (elem_bt == T_LONG) {
6902     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6903   } else {
6904     assert(elem_bt == T_INT, "");
6905     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6906   }
6907 }
6908 
// Saturating add/sub for T_INT/T_LONG lanes on EVEX-capable targets.
// x86 has no packed saturating instructions for 32/64-bit elements, so the
// result is computed with a wrapping add/sub, overflowing lanes are detected
// arithmetically, and those lanes are then replaced with the saturating
// MIN/MAX value via opmask-driven blends.
void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                         KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection based on Hacker's Delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Compute overflow detection mask: sign bit of each xtmp2 lane into ktmp1.
  evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 holds -1 in all its lanes after above call.

  // Compute mask based on first input polarity (sign bits of src1 into ktmp2);
  // the final 'true' flags that the -1 vector (xtmp1 here) is already available.
  evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);

  // xtmp2 = per-lane MAX value, xtmp1 = per-lane MIN value.
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in first input polarity mask holds a min value.
  evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
  // Blend destination lanes with saturated values using overflow detection mask.
  evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
}
6951 
6952 
// Saturating add/sub for T_INT/T_LONG lanes on AVX (non-opmask) targets.
// Same strategy as the EVEX flavor, but overflow and polarity masks are kept
// as sign-extended vectors and lane selection is done with VPBLENDVB.
void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                        XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                        XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection based on Hacker's Delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Sign-extend to compute overflow detection mask (xtmp3).
  vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);

  // xtmp1 = all-ones; derive per-lane MAX (xtmp2) and MIN (xtmp1) constants from it.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose saturating min/max vector using first input polarity mask:
  // negative src1 lanes select MIN, non-negative lanes select MAX.
  vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
  vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);

  // Blend result with saturating vector using overflow detection mask.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
6993 
6994 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6995   switch(elem_bt) {
6996     case T_BYTE:
6997       if (ideal_opc == Op_SaturatingAddV) {
6998         vpaddsb(dst, src1, src2, vlen_enc);
6999       } else {
7000         assert(ideal_opc == Op_SaturatingSubV, "");
7001         vpsubsb(dst, src1, src2, vlen_enc);
7002       }
7003       break;
7004     case T_SHORT:
7005       if (ideal_opc == Op_SaturatingAddV) {
7006         vpaddsw(dst, src1, src2, vlen_enc);
7007       } else {
7008         assert(ideal_opc == Op_SaturatingSubV, "");
7009         vpsubsw(dst, src1, src2, vlen_enc);
7010       }
7011       break;
7012     default:
7013       fatal("Unsupported type %s", type2name(elem_bt));
7014       break;
7015   }
7016 }
7017 
7018 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7019   switch(elem_bt) {
7020     case T_BYTE:
7021       if (ideal_opc == Op_SaturatingAddV) {
7022         vpaddusb(dst, src1, src2, vlen_enc);
7023       } else {
7024         assert(ideal_opc == Op_SaturatingSubV, "");
7025         vpsubusb(dst, src1, src2, vlen_enc);
7026       }
7027       break;
7028     case T_SHORT:
7029       if (ideal_opc == Op_SaturatingAddV) {
7030         vpaddusw(dst, src1, src2, vlen_enc);
7031       } else {
7032         assert(ideal_opc == Op_SaturatingSubV, "");
7033         vpsubusw(dst, src1, src2, vlen_enc);
7034       }
7035       break;
7036     default:
7037       fatal("Unsupported type %s", type2name(elem_bt));
7038       break;
7039   }
7040 }
7041 
7042 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
7043                                                      XMMRegister src2, int vlen_enc) {
7044   switch(elem_bt) {
7045     case T_BYTE:
7046       evpermi2b(dst, src1, src2, vlen_enc);
7047       break;
7048     case T_SHORT:
7049       evpermi2w(dst, src1, src2, vlen_enc);
7050       break;
7051     case T_INT:
7052       evpermi2d(dst, src1, src2, vlen_enc);
7053       break;
7054     case T_LONG:
7055       evpermi2q(dst, src1, src2, vlen_enc);
7056       break;
7057     case T_FLOAT:
7058       evpermi2ps(dst, src1, src2, vlen_enc);
7059       break;
7060     case T_DOUBLE:
7061       evpermi2pd(dst, src1, src2, vlen_enc);
7062       break;
7063     default:
7064       fatal("Unsupported type %s", type2name(elem_bt));
7065       break;
7066   }
7067 }
7068 
7069 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
7070   if (is_unsigned) {
7071     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7072   } else {
7073     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7074   }
7075 }
7076 
7077 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
7078   if (is_unsigned) {
7079     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7080   } else {
7081     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7082   }
7083 }
7084 
7085 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
7086   switch(opcode) {
7087     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7088     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7089     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7090     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7091     default: assert(false, "%s", NodeClassNames[opcode]); break;
7092   }
7093 }
7094 
7095 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7096   switch(opcode) {
7097     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7098     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7099     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7100     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7101     default: assert(false, "%s", NodeClassNames[opcode]); break;
7102   }
7103 }
7104 
// Scalar half-float max/min: delegates to the vector implementation using a
// 128-bit vector length encoding (the scalar result lives in the low element).
void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
  vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
}
7109 
// Vector FP16 max/min built on VMAXPH/VMINPH. Those instructions are not
// symmetric in their operands (for equal-valued or NaN inputs the second
// source wins), so the inputs are first conditionally swapped on the sign bit
// to get deterministic -0.0/+0.0 ordering, and a final masked move fixes up
// NaN propagation from the first operand.
void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
    // Move sign bits of src2 to mask register.
    evpmovw2m(ktmp, src2, vlen_enc);
    // xtmp1 = src2 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make second source operand a +ve value.
    // As per instruction semantic, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned
    // dst = max(xtmp1, xtmp2)
    evmaxph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if it is a NaN value,
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  } else {
    assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
    // Move sign bits of src1 to mask register.
    evpmovw2m(ktmp, src1, vlen_enc);
    // xtmp1 = src1 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src1 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make second source operand a -ve value.
    // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned.
    // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
    // or a valid floating-point value, is written to the result.
    // dst = min(xtmp1, xtmp2)
    evminph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if it is a NaN value,
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  }
}