/*
 * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/opcodes.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/globals.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/objectMonitorTable.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/synchronizer.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#include "utilities/sizes.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif
// C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp so that, on return to the interpreter, rbp will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    subptr(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
    // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
    Label dummy_slow_path;
    Label dummy_continuation;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
    }
    bs->nmethod_entry_barrier(this, slow_path, continuation);
  }
}
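
// Rough sketch of the frame laid down by verified_entry() on the stack-bang
// path (stack_bang_size > 0) with VerifyStackAtCalls; purely illustrative:
//
//      | return address      |  <- rsp at entry
//      | saved rbp           |  <- rbp, if PreserveFramePointer
//      | 0xbadb100d cookie   |
//      | ...                 |
//      | locals/spills       |  <- rsp after the frame is created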

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the lock-stack fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods.  That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) emit explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// *  use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path.  If the fast path fails then we pass
//    control to the slow path, typically in C.  In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock.  In the case of success, control
//    will drop through the node.  ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel.

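// For orientation, a rough pseudo-code sketch of the fast path fast_lock()
// emits below (register selection and monitor-table lookup details elided):
//
//   mark = obj->mark;
//   if (mark is a monitor)              goto inflated;          // 0b10 tag
//   if (lock-stack is full)             goto slow_path;
//   if (lock-stack top == obj)          goto push;              // recursive
//   if (!CAS(obj->mark, 0b01 -> 0b00))  goto slow_path;
//   push: lock-stack.push(obj); ZF = 1; done
//   inflated:
//     monitor = find_monitor(obj);      // from mark, om_cache, or table
//     if (CAS(monitor->owner, null -> owner_id)) { ZF = 1; done }
//     if (owner != owner_id)            goto slow_path;         // ZF == 0
//     monitor->recursions++; ZF = 1; done
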
// obj: object to lock
// box: on-stack box address -- KILLED
// rax: tmp -- KILLED
// t  : tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register rax_reg,
                                  Register t, Register thread) {
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Fast Lock

    Label push;

    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmp(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      const Register hash = t;
      Label monitor_found;

      // Look for the monitor in the om_cache.

      ByteSize cache_offset   = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled  = OMCache::CAPACITY;
      for (int i = 0; i < num_unrolled; i++) {
        movptr(monitor, Address(thread, cache_offset + monitor_offset));
        cmpptr(obj, Address(thread, cache_offset));
        jccb(Assembler::equal, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      // Look for the monitor in the table.

      // Get the hash code.
      movptr(hash, Address(obj, oopDesc::mark_offset_in_bytes()));
      shrq(hash, markWord::hash_shift);
      andq(hash, markWord::hash_mask);

      // Get the table and calculate the bucket's address.
      lea(rax_reg, ExternalAddress(ObjectMonitorTable::current_table_address()));
      movptr(rax_reg, Address(rax_reg));
      andq(hash, Address(rax_reg, ObjectMonitorTable::table_capacity_mask_offset()));
      movptr(rax_reg, Address(rax_reg, ObjectMonitorTable::table_buckets_offset()));

      // Read the monitor from the bucket.
      movptr(monitor, Address(rax_reg, hash, Address::times_ptr));

      // Check if the monitor in the bucket is special (empty, tombstone or removed).
      cmpptr(monitor, ObjectMonitorTable::SpecialPointerValues::below_is_special);
      jcc(Assembler::below, slow_path);

      // Check if object matches.
      movptr(rax_reg, Address(monitor, ObjectMonitor::object_offset()));
      BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
      bs_asm->try_resolve_weak_handle_in_c2(this, rax_reg, slow_path);
      cmpptr(rax_reg, obj);
      jcc(Assembler::notEqual, slow_path);

      bind(monitor_found);
    }
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock before trashing box. On failure to acquire
      // the lock, the slow path will reset the entry accordingly (see CacheSetter).
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
    lock(); cmpxchgptr(box, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive.
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

// obj: object to unlock
// rax: tmp -- KILLED
// t  : tmp -- cannot be obj or rax -- KILLED
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired in the current activation (frame).  Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI.  The "Java Native Interface
// Specification" states that an object locked by JNI's MonitorEnter should not be
// unlocked by "normal" java-level locking and vice-versa.  The specification doesn't
// specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner == Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
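//
// For orientation, a rough pseudo-code sketch of the fast path fast_unlock()
// emits below (register selection and the debug-only checks elided):
//
//   if (lock-stack top != obj)          goto inflated;
//   lock-stack.pop();
//   if (new lock-stack top == obj)      { ZF = 1; done }        // recursive
//   if (!CAS(obj->mark, 0b00 -> 0b01))  goto push_and_slow_path;
//   ZF = 1; done
//   inflated:
//     if (monitor->recursions != 0)     { monitor->recursions--; ZF = 1; done }
//     monitor->owner = null; fence();   // StoreLoad, avoids stranding
//     if (entry_list == null)           { ZF = 1; done }
//     if (succ != null)                 { ZF = 1; done }
//     record monitor in thread;         goto slow_path;         // ZF == 0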

void C2_MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register t, Register thread) {
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully.  MUST jump with ZF == 1
  Label unlocked, slow_path;

  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    stub = new (Compile::current()->comp_arena()) C2FastUnlockStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Fast Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }

  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jcc(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jcc(Assembler::notZero, recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    cmpptr(entry_list_address, NULL_WORD);
    jcc(Assembler::zero, unlocked);    // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jcc(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
  }

  bind(unlocked);
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that the slow-path continuation is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Unlock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}

static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
  const int framesize = Compile::current()->output()->frame_size_in_bytes();
  masm->movptr(dst, rsp);
  if (framesize > 2 * wordSize) {
    masm->addptr(dst, framesize - 2 * wordSize);
  }
}
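
// Worked example with illustrative numbers: if frame_size_in_bytes() is 48 and
// wordSize is 8, the helper computes dst = rsp + 48 - 2*8 = rsp + 32, the slot
// where the prologue stored the caller's rbp -- the same value rbp itself
// carries when PreserveFramePointer is enabled.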

void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
  if (PreserveFramePointer) {
    // frame pointer is valid
#ifdef ASSERT
    // Verify frame pointer value in rbp.
    reconstruct_frame_pointer_helper(this, rtmp);
    Label L_success;
    cmpq(rbp, rtmp);
    jccb(Assembler::equal, L_success);
    STOP("frame pointer mismatch");
    bind(L_success);
#endif // ASSERT
  } else {
    reconstruct_frame_pointer_helper(this, rbp);
  }
}

void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
  jint lo = t->_lo;
  jint hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
  if (t == TypeInt::INT) {
    return;
  }

  BLOCK_COMMENT("CastII {");
  Label fail;
  Label succeed;

  if (lo != min_jint) {
    cmpl(val, lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jint) {
    cmpl(val, hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  movl(c_rarg0, idx);
  movl(c_rarg1, val);
  movl(c_rarg2, lo);
  movl(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastII");
}

static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}

void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
  jlong lo = t->_lo;
  jlong hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
  if (t == TypeLong::LONG) {
    return;
  }

  BLOCK_COMMENT("CastLL {");
  Label fail;
  Label succeed;

  auto cmp_val = [&](jlong bound) {
    if (is_simm32(bound)) {
      cmpq(val, checked_cast<int>(bound));
    } else {
      mov64(tmp, bound);
      cmpq(val, tmp);
    }
  };

  if (lo != min_jlong) {
    cmp_val(lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jlong) {
    cmp_val(hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  movl(c_rarg0, idx);
  movq(c_rarg1, val);
  mov64(c_rarg2, lo);
  mov64(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastLL");
}

//-------------------------------------------------------------------------------------------
// Generic instruction support for C2 code generation from .ad files

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src);  // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src);  // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
                                  XMMRegister src1, Address src2, int vlen_enc) {
  assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
  if (opcode == Op_UMinV) {
    switch(elem_bt) {
      case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  } else {
    assert(opcode == Op_UMaxV, "required");
    switch(elem_bt) {
      case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  }
}

void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // For optimality, leverage a full vector width of 512 bits
  // for operations over smaller vector sizes on AVX512 targets.
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
    // T1 = -1
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
    // Mask = T2 > T1
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}
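
// The AVX2 fallback above uses the standard sign-bias trick: adding
// 0x8000000000000000 (i.e. -1 << 63) to both operands maps unsigned order
// onto signed order, so the signed vpcmpgtq can stand in for the missing
// unsigned 64-bit compare. Illustrative values:
//   u1 = 1, u2 = 0xFFFFFFFFFFFFFFFF (unsigned max)
//   u1 + 2^63 = 0x8000000000000001 (near signed minimum)
//   u2 + 2^63 = 0x7FFFFFFFFFFFFFFF (signed maximum)
//   a signed compare of the biased values agrees with unsigned u1 < u2.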

void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
                                  XMMRegister src1, XMMRegister src2, int vlen_enc) {
  assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
  if (opcode == Op_UMinV) {
    switch(elem_bt) {
      case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  } else {
    assert(opcode == Op_UMaxV, "required");
    switch(elem_bt) {
      case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   *  The following pseudo-code describes the algorithm for max[FD] (the min algorithm is similar):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }
  // Choose temporaries so that the EnableX86ECoreOpts path also works when dst overlaps btmp.
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX > 1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

void C2_MacroAssembler::vminmax_fp_avx10_2(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
                                           XMMRegister src1, XMMRegister src2, int vlen_enc) {
  assert(opc == Op_MinV || opc == Op_MinReductionV ||
         opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");

  int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
                                                         : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
  if (elem_bt == T_FLOAT) {
    evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
  } else {
    assert(elem_bt == T_DOUBLE, "");
    evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
  }
}

void C2_MacroAssembler::sminmax_fp_avx10_2(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
                                           XMMRegister src1, XMMRegister src2) {
  assert(opc == Op_MinF || opc == Op_MaxF ||
         opc == Op_MinD || opc == Op_MaxD, "sanity");

  int imm8 = (opc == Op_MinF || opc == Op_MinD) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
                                                : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
  if (elem_bt == T_FLOAT) {
    evminmaxss(dst, mask, src1, src2, true, imm8);
  } else {
    assert(elem_bt == T_DOUBLE, "");
    evminmaxsd(dst, mask, src1, src2, true, imm8);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  // Handle the special cases +0.0/-0.0 and NaN: if the argument is +0.0/-0.0 or NaN, return the argument.
  // If AVX10.2 (or newer) floating point comparison instructions are used, SF=1 for equal and unordered cases.
  // If other floating point comparison instructions are used, ZF=1 for equal and unordered cases.
  if (opcode == Op_SignumF) {
    if (VM_Version::supports_avx10_2()) {
      evucomxss(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomiss(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    if (VM_Version::supports_avx10_2()) {
      evucomxsd(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomisd(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}
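
// Behavior sketch for signum_fp() (Java Math.signum semantics):
//   NaN or +/-0.0  ->  the argument, unchanged (early exit above)
//   x > 0.0        ->  1.0
//   x < 0.0        -> -1.0 (1.0 with the sign bit flipped)
// On the non-AVX10.2 path this works because ucomiss/ucomisd set ZF=1 for
// both the equal and the unordered (NaN) case, and the CF-based 'above'
// test then separates the positive from the negative inputs.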

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI), "opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI), "opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1263   } else if (opcode == Op_LShiftVL) {
1264     psllq(dst, shift);
1265   } else {
1266     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1267     psrlq(dst, shift);
1268   }
1269 }
1270 
1271 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1272   switch (opcode) {
1273     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1274     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1275     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1276 
1277     default: assert(false, "%s", NodeClassNames[opcode]);
1278   }
1279 }
1280 
1281 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1282   if (opcode == Op_RShiftVL) {
1283     evpsraq(dst, nds, shift, vector_len);
1284   } else if (opcode == Op_LShiftVL) {
1285     vpsllq(dst, nds, shift, vector_len);
1286   } else {
1287     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1288     vpsrlq(dst, nds, shift, vector_len);
1289   }
1290 }
1291 
1292 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1293   switch (opcode) {
1294     case Op_RShiftVB:  // fall-through
1295     case Op_RShiftVS:  // fall-through
1296     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1297 
1298     case Op_LShiftVB:  // fall-through
1299     case Op_LShiftVS:  // fall-through
1300     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1301 
1302     case Op_URShiftVB: // fall-through
1303     case Op_URShiftVS: // fall-through
1304     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1305 
1306     default: assert(false, "%s", NodeClassNames[opcode]);
1307   }
1308 }
1309 
1310 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1311   switch (opcode) {
1312     case Op_RShiftVB:  // fall-through
1313     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1314 
1315     case Op_LShiftVB:  // fall-through
1316     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1317 
1318     case Op_URShiftVB: // fall-through
1319     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1320 
1321     default: assert(false, "%s", NodeClassNames[opcode]);
1322   }
1323 }
1324 
1325 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1326   assert(UseAVX >= 2, "required");
1327   switch (opcode) {
1328     case Op_RShiftVL: {
1329       if (UseAVX > 2) {
1330         assert(tmp == xnoreg, "not used");
1331         if (!VM_Version::supports_avx512vl()) {
1332           vlen_enc = Assembler::AVX_512bit;
1333         }
1334         evpsravq(dst, src, shift, vlen_enc);
1335       } else {
1336         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1337         vpsrlvq(dst, src, shift, vlen_enc);
1338         vpsrlvq(tmp, tmp, shift, vlen_enc);
1339         vpxor(dst, dst, tmp, vlen_enc);
1340         vpsubq(dst, dst, tmp, vlen_enc);
1341       }
1342       break;
1343     }
1344     case Op_LShiftVL: {
1345       assert(tmp == xnoreg, "not used");
1346       vpsllvq(dst, src, shift, vlen_enc);
1347       break;
1348     }
1349     case Op_URShiftVL: {
1350       assert(tmp == xnoreg, "not used");
1351       vpsrlvq(dst, src, shift, vlen_enc);
1352       break;
1353     }
1354     default: assert(false, "%s", NodeClassNames[opcode]);
1355   }
1356 }
1357 
1358 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
1359 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1360   assert(opcode == Op_LShiftVB ||
1361          opcode == Op_RShiftVB ||
1362          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1363   bool sign = (opcode != Op_URShiftVB);
1364   assert(vector_len == 0, "required");
1365   vextendbd(sign, dst, src, 1);
1366   vpmovzxbd(vtmp, shift, 1);
1367   varshiftd(opcode, dst, dst, vtmp, 1);
1368   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1369   vextracti128_high(vtmp, dst);
1370   vpackusdw(dst, dst, vtmp, 0);
1371 }
1372 
// Variable shift of the byte elements in src by the per-element counts in shift, using vtmp as a TEMP, giving a byte result in dst
1374 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1375   assert(opcode == Op_LShiftVB ||
1376          opcode == Op_RShiftVB ||
1377          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1378   bool sign = (opcode != Op_URShiftVB);
1379   int ext_vector_len = vector_len + 1;
1380   vextendbw(sign, dst, src, ext_vector_len);
1381   vpmovzxbw(vtmp, shift, ext_vector_len);
1382   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1383   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1384   if (vector_len == 0) {
1385     vextracti128_high(vtmp, dst);
1386     vpackuswb(dst, dst, vtmp, vector_len);
1387   } else {
1388     vextracti64x4_high(vtmp, dst);
1389     vpackuswb(dst, dst, vtmp, vector_len);
1390     vpermq(dst, dst, 0xD8, vector_len);
1391   }
1392 }
1393 
1394 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1395   switch(typ) {
1396     case T_BYTE:
1397       pinsrb(dst, val, idx);
1398       break;
1399     case T_SHORT:
1400       pinsrw(dst, val, idx);
1401       break;
1402     case T_INT:
1403       pinsrd(dst, val, idx);
1404       break;
1405     case T_LONG:
1406       pinsrq(dst, val, idx);
1407       break;
1408     default:
1409       assert(false,"Should not reach here.");
1410       break;
1411   }
1412 }
1413 
1414 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1415   switch(typ) {
1416     case T_BYTE:
1417       vpinsrb(dst, src, val, idx);
1418       break;
1419     case T_SHORT:
1420       vpinsrw(dst, src, val, idx);
1421       break;
1422     case T_INT:
1423       vpinsrd(dst, src, val, idx);
1424       break;
1425     case T_LONG:
1426       vpinsrq(dst, src, val, idx);
1427       break;
1428     default:
1429       assert(false,"Should not reach here.");
1430       break;
1431   }
1432 }
1433 
1434 void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
1435                                          Register base, Register idx_base,
1436                                          Register mask, Register mask_idx,
1437                                          Register rtmp, int vlen_enc) {
1438   vpxor(dst, dst, dst, vlen_enc);
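  // All lanes start zeroed; btq copies bit mask_idx of the mask register into
  // CF, so an element is loaded only when its mask bit is set and the lane
  // stays zero otherwise.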
1439   if (elem_bt == T_SHORT) {
1440     for (int i = 0; i < 4; i++) {
1441       // dst[i] = mask[i] ? src[idx_base[i]] : 0
1442       Label skip_load;
1443       btq(mask, mask_idx);
1444       jccb(Assembler::carryClear, skip_load);
1445       movl(rtmp, Address(idx_base, i * 4));
1446       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1447       bind(skip_load);
1448       incq(mask_idx);
1449     }
1450   } else {
1451     assert(elem_bt == T_BYTE, "");
1452     for (int i = 0; i < 8; i++) {
1453       // dst[i] = mask[i] ? src[idx_base[i]] : 0
1454       Label skip_load;
1455       btq(mask, mask_idx);
1456       jccb(Assembler::carryClear, skip_load);
1457       movl(rtmp, Address(idx_base, i * 4));
1458       pinsrb(dst, Address(base, rtmp), i);
1459       bind(skip_load);
1460       incq(mask_idx);
1461     }
1462   }
1463 }
1464 
1465 void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
1466                                   Register base, Register idx_base,
1467                                   Register rtmp, int vlen_enc) {
1468   vpxor(dst, dst, dst, vlen_enc);
1469   if (elem_bt == T_SHORT) {
1470     for (int i = 0; i < 4; i++) {
1471       // dst[i] = src[idx_base[i]]
1472       movl(rtmp, Address(idx_base, i * 4));
1473       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1474     }
1475   } else {
1476     assert(elem_bt == T_BYTE, "");
1477     for (int i = 0; i < 8; i++) {
1478       // dst[i] = src[idx_base[i]]
1479       movl(rtmp, Address(idx_base, i * 4));
1480       pinsrb(dst, Address(base, rtmp), i);
1481     }
1482   }
1483 }
1484 
/*
 * Gather using a hybrid algorithm: first partially unroll a scalar loop
 * to accumulate values from the gather indices into a quad-word (64-bit)
 * slice. A slice holds 8 byte or 4 short values. This is followed by a
 * vector permutation to place the slice into the appropriate vector lane
 * locations in the destination vector. The following pseudo code describes
 * the algorithm in detail:
 *
 * DST_VEC = ZERO_VEC
 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
 * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
 * FOREACH_ITER:
 *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
 *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
 *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
 *     PERM_INDEX = PERM_INDEX - TWO_VEC
 *
 * With each iteration, the doubleword permute indices (0, 1) corresponding
 * to the gathered quadword get shifted right by two lane positions.
 */
1506 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1507                                         Register base, Register idx_base,
1508                                         Register mask, XMMRegister xtmp1,
1509                                         XMMRegister xtmp2, XMMRegister temp_dst,
1510                                         Register rtmp, Register mask_idx,
1511                                         Register length, int vector_len, int vlen_enc) {
1512   Label GATHER8_LOOP;
1513   assert(is_subword_type(elem_ty), "");
1514   movl(length, vector_len);
1515   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1516   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1517   vallones(xtmp2, vlen_enc);
1518   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1519   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1520   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1521 
1522   bind(GATHER8_LOOP);
1523     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1524     if (mask == noreg) {
1525       vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
1526     } else {
1527       vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
1528     }
1529     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1530     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1531     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1532     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1533     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1534     vpor(dst, dst, temp_dst, vlen_enc);
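    // Advance the index pointer and the remaining-element count: each
    // iteration consumes one 64-bit slice, i.e. 8 byte elements (32 bytes of
    // int indices) or 4 short elements (16 bytes of int indices).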
    addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1));
1536     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1537     jcc(Assembler::notEqual, GATHER8_LOOP);
1538 }
1539 
1540 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1541   switch(typ) {
1542     case T_INT:
1543       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1544       break;
1545     case T_FLOAT:
1546       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1547       break;
1548     case T_LONG:
1549       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1550       break;
1551     case T_DOUBLE:
1552       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1553       break;
1554     default:
1555       assert(false,"Should not reach here.");
1556       break;
1557   }
1558 }
1559 
1560 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1561   switch(typ) {
1562     case T_INT:
1563       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1564       break;
1565     case T_FLOAT:
1566       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1567       break;
1568     case T_LONG:
1569       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1570       break;
1571     case T_DOUBLE:
1572       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1573       break;
1574     default:
1575       assert(false,"Should not reach here.");
1576       break;
1577   }
1578 }
1579 
1580 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1581   switch(typ) {
1582     case T_INT:
1583       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1584       break;
1585     case T_FLOAT:
1586       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1587       break;
1588     case T_LONG:
1589       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1590       break;
1591     case T_DOUBLE:
1592       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1593       break;
1594     default:
1595       assert(false,"Should not reach here.");
1596       break;
1597   }
1598 }
1599 
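// Convert a vector of booleans (0 or 1 per byte) into a vector mask:
// computing 0 - x turns each 1 into 0xFF (all bits set), and the result is
// then sign-extended to the element width.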
1600 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1601   if (vlen_in_bytes <= 16) {
1602     pxor (dst, dst);
1603     psubb(dst, src);
1604     switch (elem_bt) {
1605       case T_BYTE:   /* nothing to do */ break;
1606       case T_SHORT:  pmovsxbw(dst, dst); break;
1607       case T_INT:    pmovsxbd(dst, dst); break;
1608       case T_FLOAT:  pmovsxbd(dst, dst); break;
1609       case T_LONG:   pmovsxbq(dst, dst); break;
1610       case T_DOUBLE: pmovsxbq(dst, dst); break;
1611 
1612       default: assert(false, "%s", type2name(elem_bt));
1613     }
1614   } else {
1615     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1616     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1617 
1618     vpxor (dst, dst, dst, vlen_enc);
1619     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1620 
1621     switch (elem_bt) {
1622       case T_BYTE:   /* nothing to do */            break;
1623       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1624       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1625       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1626       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1627       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1628 
1629       default: assert(false, "%s", type2name(elem_bt));
1630     }
1631   }
1632 }
1633 
1634 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1635   if (novlbwdq) {
1636     vpmovsxbd(xtmp, src, vlen_enc);
1637     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1638             Assembler::eq, true, vlen_enc, noreg);
1639   } else {
1640     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1641     vpsubb(xtmp, xtmp, src, vlen_enc);
1642     evpmovb2m(dst, xtmp, vlen_enc);
1643   }
1644 }
1645 
1646 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1647   if (is_integral_type(bt)) {
1648     switch (vlen_in_bytes) {
1649       case 4:  movdl(dst, src);   break;
1650       case 8:  movq(dst, src);    break;
1651       case 16: movdqu(dst, src);  break;
1652       case 32: vmovdqu(dst, src); break;
1653       case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1654       default: ShouldNotReachHere();
1655     }
1656   } else {
1657     switch (vlen_in_bytes) {
1658       case 4:  movflt(dst, src); break;
1659       case 8:  movdbl(dst, src); break;
1660       case 16: movups(dst, src); break;
1661       case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1662       case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1663       default: ShouldNotReachHere();
1664     }
1665   }
1666 }
1667 
1668 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1669   assert(rscratch != noreg || always_reachable(src), "missing");
1670 
1671   if (reachable(src)) {
1672     load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1673   } else {
1674     lea(rscratch, src);
1675     load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1676   }
1677 }
1678 
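// Broadcast a constant scalar from memory into every vector lane. The
// integral broadcast instructions require AVX2, so plain AVX falls back to
// the floating-point broadcast/duplicate forms, and pre-AVX hardware falls
// back to movddup or an ordinary vector load.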
1679 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1680   int vlen_enc = vector_length_encoding(vlen);
1681   if (VM_Version::supports_avx()) {
1682     if (bt == T_LONG) {
1683       if (VM_Version::supports_avx2()) {
1684         vpbroadcastq(dst, src, vlen_enc);
1685       } else {
1686         vmovddup(dst, src, vlen_enc);
1687       }
1688     } else if (bt == T_DOUBLE) {
1689       if (vlen_enc != Assembler::AVX_128bit) {
1690         vbroadcastsd(dst, src, vlen_enc, noreg);
1691       } else {
1692         vmovddup(dst, src, vlen_enc);
1693       }
1694     } else {
1695       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1696         vpbroadcastd(dst, src, vlen_enc);
1697       } else {
1698         vbroadcastss(dst, src, vlen_enc);
1699       }
1700     }
1701   } else if (VM_Version::supports_sse3()) {
1702     movddup(dst, src);
1703   } else {
1704     load_vector(bt, dst, src, vlen);
1705   }
1706 }
1707 
1708 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1709   int entry_idx = vector_iota_entry_index(bt);
1710   ExternalAddress addr(StubRoutines::x86::vector_iota_indices(entry_idx));
1711   load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1712 }
1713 
1714 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1715 
1716 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1717   int vector_len = Assembler::AVX_128bit;
1718 
1719   switch (opcode) {
1720     case Op_AndReductionV:  pand(dst, src); break;
1721     case Op_OrReductionV:   por (dst, src); break;
1722     case Op_XorReductionV:  pxor(dst, src); break;
1723     case Op_MinReductionV:
1724       switch (typ) {
1725         case T_BYTE:        pminsb(dst, src); break;
1726         case T_SHORT:       pminsw(dst, src); break;
1727         case T_INT:         pminsd(dst, src); break;
1728         case T_LONG:        assert(UseAVX > 2, "required");
1729                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1730         default:            assert(false, "wrong type");
1731       }
1732       break;
1733     case Op_MaxReductionV:
1734       switch (typ) {
1735         case T_BYTE:        pmaxsb(dst, src); break;
1736         case T_SHORT:       pmaxsw(dst, src); break;
1737         case T_INT:         pmaxsd(dst, src); break;
1738         case T_LONG:        assert(UseAVX > 2, "required");
1739                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1740         default:            assert(false, "wrong type");
1741       }
1742       break;
1743     case Op_UMinReductionV:
1744       switch (typ) {
1745         case T_BYTE:        vpminub(dst, dst, src, Assembler::AVX_128bit); break;
1746         case T_SHORT:       vpminuw(dst, dst, src, Assembler::AVX_128bit); break;
1747         case T_INT:         vpminud(dst, dst, src, Assembler::AVX_128bit); break;
1748         case T_LONG:        evpminuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
1749         default:            assert(false, "wrong type");
1750       }
1751       break;
1752     case Op_UMaxReductionV:
1753       switch (typ) {
1754         case T_BYTE:        vpmaxub(dst, dst, src, Assembler::AVX_128bit); break;
1755         case T_SHORT:       vpmaxuw(dst, dst, src, Assembler::AVX_128bit); break;
1756         case T_INT:         vpmaxud(dst, dst, src, Assembler::AVX_128bit); break;
1757         case T_LONG:        evpmaxuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
1758         default:            assert(false, "wrong type");
1759       }
1760       break;
1761     case Op_AddReductionVF: addss(dst, src); break;
1762     case Op_AddReductionVD: addsd(dst, src); break;
1763     case Op_AddReductionVI:
1764       switch (typ) {
1765         case T_BYTE:        paddb(dst, src); break;
1766         case T_SHORT:       paddw(dst, src); break;
1767         case T_INT:         paddd(dst, src); break;
1768         default:            assert(false, "wrong type");
1769       }
1770       break;
1771     case Op_AddReductionVL: paddq(dst, src); break;
1772     case Op_MulReductionVF: mulss(dst, src); break;
1773     case Op_MulReductionVD: mulsd(dst, src); break;
1774     case Op_MulReductionVI:
1775       switch (typ) {
1776         case T_SHORT:       pmullw(dst, src); break;
1777         case T_INT:         pmulld(dst, src); break;
1778         default:            assert(false, "wrong type");
1779       }
1780       break;
1781     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1782                             evpmullq(dst, dst, src, vector_len); break;
1783     default:                assert(false, "wrong opcode");
1784   }
1785 }
1786 
1787 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1788   switch (opcode) {
1789     case Op_AddReductionVF: addps(dst, src); break;
1790     case Op_AddReductionVD: addpd(dst, src); break;
1791     case Op_MulReductionVF: mulps(dst, src); break;
1792     case Op_MulReductionVD: mulpd(dst, src); break;
1793     default:                assert(false, "%s", NodeClassNames[opcode]);
1794   }
1795 }
1796 
1797 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1798   int vector_len = Assembler::AVX_256bit;
1799 
1800   switch (opcode) {
1801     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1802     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1803     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1804     case Op_MinReductionV:
1805       switch (typ) {
1806         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1807         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1808         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1809         case T_LONG:        assert(UseAVX > 2, "required");
1810                             vpminsq(dst, src1, src2, vector_len); break;
1811         default:            assert(false, "wrong type");
1812       }
1813       break;
1814     case Op_MaxReductionV:
1815       switch (typ) {
1816         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1817         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1818         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1819         case T_LONG:        assert(UseAVX > 2, "required");
1820                             vpmaxsq(dst, src1, src2, vector_len); break;
1821         default:            assert(false, "wrong type");
1822       }
1823       break;
1824     case Op_UMinReductionV:
1825       switch (typ) {
1826         case T_BYTE:        vpminub(dst, src1, src2, vector_len); break;
1827         case T_SHORT:       vpminuw(dst, src1, src2, vector_len); break;
1828         case T_INT:         vpminud(dst, src1, src2, vector_len); break;
1829         case T_LONG:        evpminuq(dst, k0, src1, src2, true, vector_len); break;
1830         default:            assert(false, "wrong type");
1831       }
1832       break;
1833     case Op_UMaxReductionV:
1834       switch (typ) {
1835         case T_BYTE:        vpmaxub(dst, src1, src2, vector_len); break;
1836         case T_SHORT:       vpmaxuw(dst, src1, src2, vector_len); break;
1837         case T_INT:         vpmaxud(dst, src1, src2, vector_len); break;
1838         case T_LONG:        evpmaxuq(dst, k0, src1, src2, true, vector_len); break;
1839         default:            assert(false, "wrong type");
1840       }
1841       break;
1842     case Op_AddReductionVI:
1843       switch (typ) {
1844         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1845         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1846         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1847         default:            assert(false, "wrong type");
1848       }
1849       break;
1850     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1851     case Op_MulReductionVI:
1852       switch (typ) {
1853         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1854         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1855         default:            assert(false, "wrong type");
1856       }
1857       break;
1858     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1859     default:                assert(false, "wrong opcode");
1860   }
1861 }
1862 
1863 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1864   int vector_len = Assembler::AVX_256bit;
1865 
1866   switch (opcode) {
1867     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1868     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1869     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1870     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1871     default:                assert(false, "%s", NodeClassNames[opcode]);
1872   }
1873 }
1874 
1875 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1876                                   XMMRegister dst, XMMRegister src,
1877                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1878   switch (opcode) {
1879     case Op_AddReductionVF:
1880     case Op_MulReductionVF:
1881       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1882       break;
1883 
1884     case Op_AddReductionVD:
1885     case Op_MulReductionVD:
1886       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1887       break;
1888 
1889     default: assert(false, "wrong opcode");
1890   }
1891 }
1892 
1893 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1894                                             XMMRegister dst, XMMRegister src,
1895                                             XMMRegister vtmp1, XMMRegister vtmp2) {
1896   switch (opcode) {
1897     case Op_AddReductionVF:
1898     case Op_MulReductionVF:
1899       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1900       break;
1901 
1902     case Op_AddReductionVD:
1903     case Op_MulReductionVD:
1904       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1905       break;
1906 
1907     default: assert(false, "%s", NodeClassNames[opcode]);
1908   }
1909 }
1910 
1911 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1912                              Register dst, Register src1, XMMRegister src2,
1913                              XMMRegister vtmp1, XMMRegister vtmp2) {
1914   switch (vlen) {
1915     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1916     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1917     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1918     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1919 
1920     default: assert(false, "wrong vector length");
1921   }
1922 }
1923 
1924 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1925                              Register dst, Register src1, XMMRegister src2,
1926                              XMMRegister vtmp1, XMMRegister vtmp2) {
1927   switch (vlen) {
1928     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1929     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1930     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1931     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1932 
1933     default: assert(false, "wrong vector length");
1934   }
1935 }
1936 
1937 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1938                              Register dst, Register src1, XMMRegister src2,
1939                              XMMRegister vtmp1, XMMRegister vtmp2) {
1940   switch (vlen) {
1941     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1942     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1943     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1944     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1945 
1946     default: assert(false, "wrong vector length");
1947   }
1948 }
1949 
1950 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1951                              Register dst, Register src1, XMMRegister src2,
1952                              XMMRegister vtmp1, XMMRegister vtmp2) {
1953   switch (vlen) {
1954     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1955     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1956     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1957     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1958 
1959     default: assert(false, "wrong vector length");
1960   }
1961 }
1962 
1963 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1964                              Register dst, Register src1, XMMRegister src2,
1965                              XMMRegister vtmp1, XMMRegister vtmp2) {
1966   switch (vlen) {
1967     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1968     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1969     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1970 
1971     default: assert(false, "wrong vector length");
1972   }
1973 }
1974 
1975 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1976   switch (vlen) {
1977     case 2:
1978       assert(vtmp2 == xnoreg, "");
1979       reduce2F(opcode, dst, src, vtmp1);
1980       break;
1981     case 4:
1982       assert(vtmp2 == xnoreg, "");
1983       reduce4F(opcode, dst, src, vtmp1);
1984       break;
1985     case 8:
1986       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1987       break;
1988     case 16:
1989       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1990       break;
1991     default: assert(false, "wrong vector length");
1992   }
1993 }
1994 
1995 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1996   switch (vlen) {
1997     case 2:
1998       assert(vtmp2 == xnoreg, "");
1999       reduce2D(opcode, dst, src, vtmp1);
2000       break;
2001     case 4:
2002       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2003       break;
2004     case 8:
2005       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2006       break;
2007     default: assert(false, "wrong vector length");
2008   }
2009 }
2010 
2011 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2012   switch (vlen) {
2013     case 2:
2014       assert(vtmp1 == xnoreg, "");
2015       assert(vtmp2 == xnoreg, "");
2016       unorderedReduce2F(opcode, dst, src);
2017       break;
2018     case 4:
2019       assert(vtmp2 == xnoreg, "");
2020       unorderedReduce4F(opcode, dst, src, vtmp1);
2021       break;
2022     case 8:
2023       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2024       break;
2025     case 16:
2026       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2027       break;
2028     default: assert(false, "wrong vector length");
2029   }
2030 }
2031 
2032 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2033   switch (vlen) {
2034     case 2:
2035       assert(vtmp1 == xnoreg, "");
2036       assert(vtmp2 == xnoreg, "");
2037       unorderedReduce2D(opcode, dst, src);
2038       break;
2039     case 4:
2040       assert(vtmp2 == xnoreg, "");
2041       unorderedReduce4D(opcode, dst, src, vtmp1);
2042       break;
2043     case 8:
2044       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2045       break;
2046     default: assert(false, "wrong vector length");
2047   }
2048 }
2049 
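// Reduce two int lanes into dst: for addition a single phaddd folds the
// adjacent pair horizontally; otherwise lane 1 is brought down with pshufd
// and combined. The scalar accumulator src1 is folded in last.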
2050 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2051   if (opcode == Op_AddReductionVI) {
2052     if (vtmp1 != src2) {
2053       movdqu(vtmp1, src2);
2054     }
2055     phaddd(vtmp1, vtmp1);
2056   } else {
2057     pshufd(vtmp1, src2, 0x1);
2058     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2059   }
2060   movdl(vtmp2, src1);
2061   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2062   movdl(dst, vtmp1);
2063 }
2064 
2065 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2066   if (opcode == Op_AddReductionVI) {
2067     if (vtmp1 != src2) {
2068       movdqu(vtmp1, src2);
2069     }
2070     phaddd(vtmp1, src2);
2071     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2072   } else {
2073     pshufd(vtmp2, src2, 0xE);
2074     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2075     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2076   }
2077 }
2078 
2079 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2080   if (opcode == Op_AddReductionVI) {
2081     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2082     vextracti128_high(vtmp2, vtmp1);
2083     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2084     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2085   } else {
2086     vextracti128_high(vtmp1, src2);
2087     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2088     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2089   }
2090 }
2091 
2092 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2093   vextracti64x4_high(vtmp2, src2);
2094   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2095   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2096 }
2097 
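// Reduce eight byte lanes: repeatedly fold the upper half onto the lower one
// (4+4, then 2+2, then 1+1 bytes), widen the surviving byte to a dword
// (zero-extended for the unsigned min/max reductions, sign-extended
// otherwise), fold in the accumulator src1, and sign-extend the final byte
// into dst.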
2098 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2099   pshufd(vtmp2, src2, 0x1);
2100   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2101   movdqu(vtmp1, vtmp2);
2102   psrldq(vtmp1, 2);
2103   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2104   movdqu(vtmp2, vtmp1);
2105   psrldq(vtmp2, 1);
2106   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2107   movdl(vtmp2, src1);
2108   if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
2109     pmovzxbd(vtmp1, vtmp1);
2110   } else {
2111     pmovsxbd(vtmp1, vtmp1);
2112   }
2113   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2114   pextrb(dst, vtmp1, 0x0);
2115   movsbl(dst, dst);
2116 }
2117 
2118 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2119   pshufd(vtmp1, src2, 0xE);
2120   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2121   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2122 }
2123 
2124 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2125   vextracti128_high(vtmp2, src2);
2126   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2127   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2128 }
2129 
2130 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2131   vextracti64x4_high(vtmp1, src2);
2132   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2133   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2134 }
2135 
2136 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2137   pmovsxbw(vtmp2, src2);
2138   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2139 }
2140 
2141 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2142   if (UseAVX > 1) {
2143     int vector_len = Assembler::AVX_256bit;
2144     vpmovsxbw(vtmp1, src2, vector_len);
2145     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2146   } else {
2147     pmovsxbw(vtmp2, src2);
2148     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2149     pshufd(vtmp2, src2, 0xe);
2150     pmovsxbw(vtmp2, vtmp2);
2151     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2152   }
2153 }
2154 
2155 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2156   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2157     int vector_len = Assembler::AVX_512bit;
2158     vpmovsxbw(vtmp1, src2, vector_len);
2159     reduce32S(opcode, dst, src1, vtmp1, vtmp2, vtmp1);
2160   } else {
    assert(UseAVX >= 2, "required");
2162     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2163     vextracti128_high(vtmp2, src2);
2164     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2165   }
2166 }
2167 
2168 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2169   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2170   vextracti64x4_high(vtmp2, src2);
2171   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2172 }
2173 
2174 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2175   if (opcode == Op_AddReductionVI) {
2176     if (vtmp1 != src2) {
2177       movdqu(vtmp1, src2);
2178     }
2179     phaddw(vtmp1, vtmp1);
2180     phaddw(vtmp1, vtmp1);
2181   } else {
2182     pshufd(vtmp2, src2, 0x1);
2183     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2184     movdqu(vtmp1, vtmp2);
2185     psrldq(vtmp1, 2);
2186     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2187   }
2188   movdl(vtmp2, src1);
2189   if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
2190     pmovzxwd(vtmp1, vtmp1);
2191   } else {
2192     pmovsxwd(vtmp1, vtmp1);
2193   }
2194   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2195   pextrw(dst, vtmp1, 0x0);
2196   movswl(dst, dst);
2197 }
2198 
2199 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2200   if (opcode == Op_AddReductionVI) {
2201     if (vtmp1 != src2) {
2202       movdqu(vtmp1, src2);
2203     }
2204     phaddw(vtmp1, src2);
2205   } else {
2206     assert_different_registers(src2, vtmp1);
2207     pshufd(vtmp1, src2, 0xE);
2208     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2209   }
2210   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2211 }
2212 
2213 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2214   if (opcode == Op_AddReductionVI) {
2215     int vector_len = Assembler::AVX_256bit;
2216     vphaddw(vtmp2, src2, src2, vector_len);
2217     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2218   } else {
2219     assert_different_registers(src2, vtmp2);
2220     vextracti128_high(vtmp2, src2);
2221     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2222   }
2223   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2224 }
2225 
2226 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2227   assert_different_registers(src2, vtmp1);
2228   int vector_len = Assembler::AVX_256bit;
2229   vextracti64x4_high(vtmp1, src2);
2230   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2231   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2232 }
2233 
2234 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2235   pshufd(vtmp2, src2, 0xE);
2236   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2237   movdq(vtmp1, src1);
2238   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2239   movdq(dst, vtmp1);
2240 }
2241 
2242 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2243   vextracti128_high(vtmp1, src2);
2244   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2245   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2246 }
2247 
2248 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2249   vextracti64x4_high(vtmp2, src2);
2250   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2251   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2252 }
2253 
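// Materialize an opmask with the low len bits set: start from all ones and
// let bzhi clear every bit at position len and above.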
2254 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2255   mov64(temp, -1L);
2256   bzhiq(temp, temp, len);
2257   kmovql(dst, temp);
2258 }
2259 
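// Ordered FP reductions: addss/mulss operate on the low element only, so the
// lanes are folded into the accumulator strictly one at a time, preserving
// the sequential floating-point semantics required by ordered reductions.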
2260 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2261   reduce_operation_128(T_FLOAT, opcode, dst, src);
2262   pshufd(vtmp, src, 0x1);
2263   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2264 }
2265 
2266 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2267   reduce2F(opcode, dst, src, vtmp);
2268   pshufd(vtmp, src, 0x2);
2269   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2270   pshufd(vtmp, src, 0x3);
2271   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2272 }
2273 
2274 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2275   reduce4F(opcode, dst, src, vtmp2);
2276   vextractf128_high(vtmp2, src);
2277   reduce4F(opcode, dst, vtmp2, vtmp1);
2278 }
2279 
2280 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2281   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2282   vextracti64x4_high(vtmp1, src);
2283   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2284 }
2285 
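// Unordered FP reductions: the vector halves are folded with packed
// operations (addps/mulps and friends), which reassociates the computation;
// only the final two-element step uses a scalar operation. There is no
// separate accumulator; the result is produced directly in dst.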
2286 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2287   pshufd(dst, src, 0x1);
2288   reduce_operation_128(T_FLOAT, opcode, dst, src);
2289 }
2290 
2291 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2292   pshufd(vtmp, src, 0xE);
2293   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2294   unorderedReduce2F(opcode, dst, vtmp);
2295 }
2296 
2297 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2298   vextractf128_high(vtmp1, src);
2299   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2300   unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2301 }
2302 
2303 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2304   vextractf64x4_high(vtmp2, src);
2305   unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2306   unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2307 }
2308 
2309 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2310   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2311   pshufd(vtmp, src, 0xE);
2312   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2313 }
2314 
2315 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2316   reduce2D(opcode, dst, src, vtmp2);
2317   vextractf128_high(vtmp2, src);
2318   reduce2D(opcode, dst, vtmp2, vtmp1);
2319 }
2320 
2321 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2322   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2323   vextracti64x4_high(vtmp1, src);
2324   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2325 }
2326 
2327 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2328   pshufd(dst, src, 0xE);
2329   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2330 }
2331 
2332 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2333   vextractf128_high(vtmp, src);
2334   unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2335   unorderedReduce2D(opcode, dst, vtmp);
2336 }
2337 
2338 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2339   vextractf64x4_high(vtmp2, src);
2340   unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2341   unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2342 }
2343 
2344 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2345   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2346 }
2347 
2348 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2349   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2350 }
2351 
2352 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
2353   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2354 }
2355 
2356 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2357                                  int vec_enc) {
2358   switch(elem_bt) {
2359     case T_INT:
2360     case T_FLOAT:
2361       vmaskmovps(dst, src, mask, vec_enc);
2362       break;
2363     case T_LONG:
2364     case T_DOUBLE:
2365       vmaskmovpd(dst, src, mask, vec_enc);
2366       break;
2367     default:
2368       fatal("Unsupported type %s", type2name(elem_bt));
2369       break;
2370   }
2371 }
2372 
2373 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2374                                  int vec_enc) {
2375   switch(elem_bt) {
2376     case T_INT:
2377     case T_FLOAT:
2378       vmaskmovps(dst, src, mask, vec_enc);
2379       break;
2380     case T_LONG:
2381     case T_DOUBLE:
2382       vmaskmovpd(dst, src, mask, vec_enc);
2383       break;
2384     default:
2385       fatal("Unsupported type %s", type2name(elem_bt));
2386       break;
2387   }
2388 }
2389 
2390 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2391                                           XMMRegister dst, XMMRegister src,
2392                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2393                                           XMMRegister xmm_0, XMMRegister xmm_1) {
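  // Fold the vector in halves: 512->256 and 256->128 bits via lane extracts,
  // then two in-lane vpermilps steps (selected by permconst) reduce four
  // floats to two and two to one. The running value moves into wdst after the
  // first fold; when is_dst_valid is set, the incoming accumulator in dst is
  // folded in at the very end.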
2394   const int permconst[] = {1, 14};
2395   XMMRegister wsrc = src;
2396   XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0 : xmm_1;
2398 
2399   int vlen_enc = Assembler::AVX_128bit;
2400   if (vlen == 16) {
2401     vlen_enc = Assembler::AVX_256bit;
2402   }
2403 
  for (int i = log2(vlen) - 1; i >= 0; i--) {
2405     if (i == 0 && !is_dst_valid) {
2406       wdst = dst;
2407     }
2408     if (i == 3) {
2409       vextracti64x4_high(wtmp, wsrc);
2410     } else if (i == 2) {
2411       vextracti128_high(wtmp, wsrc);
2412     } else { // i = [0,1]
2413       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2414     }
2415 
2416     if (VM_Version::supports_avx10_2()) {
2417       vminmax_fp_avx10_2(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
2418     } else {
2419       vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2420     }
2421     wsrc = wdst;
2422     vlen_enc = Assembler::AVX_128bit;
2423   }
2424   if (is_dst_valid) {
2425     if (VM_Version::supports_avx10_2()) {
2426       vminmax_fp_avx10_2(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
2427     } else {
2428       vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2429     }
2430   }
2431 }
2432 
2433 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2434                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2435                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2436   XMMRegister wsrc = src;
2437   XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0 : xmm_1;
2439   int vlen_enc = Assembler::AVX_128bit;
2440   if (vlen == 8) {
2441     vlen_enc = Assembler::AVX_256bit;
2442   }
  for (int i = log2(vlen) - 1; i >= 0; i--) {
2444     if (i == 0 && !is_dst_valid) {
2445       wdst = dst;
2446     }
2447     if (i == 1) {
2448       vextracti128_high(wtmp, wsrc);
2449     } else if (i == 2) {
2450       vextracti64x4_high(wtmp, wsrc);
2451     } else {
2452       assert(i == 0, "%d", i);
2453       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2454     }
2455 
2456     if (VM_Version::supports_avx10_2()) {
2457       vminmax_fp_avx10_2(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
2458     } else {
2459       vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2460     }
2461 
2462     wsrc = wdst;
2463     vlen_enc = Assembler::AVX_128bit;
2464   }
2465 
2466   if (is_dst_valid) {
2467     if (VM_Version::supports_avx10_2()) {
2468       vminmax_fp_avx10_2(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
2469     } else {
2470       vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2471     }
2472   }
2473 }
2474 
2475 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2476   switch (bt) {
2477     case T_BYTE:  pextrb(dst, src, idx); break;
2478     case T_SHORT: pextrw(dst, src, idx); break;
2479     case T_INT:   pextrd(dst, src, idx); break;
2480     case T_LONG:  pextrq(dst, src, idx); break;
2481 
2482     default:
2483       assert(false,"Should not reach here.");
2484       break;
2485   }
2486 }
2487 
2488 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
  int esize = type2aelembytes(typ);
2490   int elem_per_lane = 16/esize;
2491   int lane = elemindex / elem_per_lane;
2492   int eindex = elemindex % elem_per_lane;
2493 
2494   if (lane >= 2) {
2495     assert(UseAVX > 2, "required");
2496     vextractf32x4(dst, src, lane & 3);
2497     return dst;
2498   } else if (lane > 0) {
2499     assert(UseAVX > 0, "required");
2500     vextractf128(dst, src, lane);
2501     return dst;
2502   } else {
2503     return src;
2504   }
2505 }
2506 
2507 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2508   if (typ == T_BYTE) {
2509     movsbl(dst, dst);
2510   } else if (typ == T_SHORT) {
2511     movswl(dst, dst);
2512   }
2513 }
2514 
2515 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
  int esize = type2aelembytes(typ);
2517   int elem_per_lane = 16/esize;
2518   int eindex = elemindex % elem_per_lane;
2519   assert(is_integral_type(typ),"required");
2520 
2521   if (eindex == 0) {
2522     if (typ == T_LONG) {
2523       movq(dst, src);
2524     } else {
2525       movdl(dst, src);
2526       movsxl(typ, dst);
2527     }
2528   } else {
2529     extract(typ, dst, src, eindex);
2530     movsxl(typ, dst);
2531   }
2532 }
2533 
2534 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
  int esize = type2aelembytes(typ);
2536   int elem_per_lane = 16/esize;
2537   int eindex = elemindex % elem_per_lane;
2538   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2539 
2540   if (eindex == 0) {
2541     movq(dst, src);
2542   } else {
2543     if (typ == T_FLOAT) {
2544       if (UseAVX == 0) {
2545         movdqu(dst, src);
2546         shufps(dst, dst, eindex);
2547       } else {
2548         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2549       }
2550     } else {
2551       if (UseAVX == 0) {
2552         movdqu(dst, src);
2553         psrldq(dst, eindex*esize);
2554       } else {
2555         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2556       }
2557       movq(dst, dst);
2558     }
2559   }
2560   // Zero upper bits
2561   if (typ == T_FLOAT) {
2562     if (UseAVX == 0) {
2563       assert(vtmp != xnoreg, "required.");
2564       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2565       pand(dst, vtmp);
2566     } else {
2567       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2568     }
2569   }
2570 }
2571 
2572 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2573   switch(typ) {
2574     case T_BYTE:
2575     case T_BOOLEAN:
2576       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2577       break;
2578     case T_SHORT:
2579     case T_CHAR:
2580       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2581       break;
2582     case T_INT:
2583     case T_FLOAT:
2584       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2585       break;
2586     case T_LONG:
2587     case T_DOUBLE:
2588       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2589       break;
2590     default:
2591       assert(false,"Should not reach here.");
2592       break;
2593   }
2594 }
2595 
2596 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2597   assert(rscratch != noreg || always_reachable(src2), "missing");
2598 
2599   switch(typ) {
2600     case T_BOOLEAN:
2601     case T_BYTE:
2602       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2603       break;
2604     case T_CHAR:
2605     case T_SHORT:
2606       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2607       break;
2608     case T_INT:
2609     case T_FLOAT:
2610       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2611       break;
2612     case T_LONG:
2613     case T_DOUBLE:
2614       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2615       break;
2616     default:
2617       assert(false,"Should not reach here.");
2618       break;
2619   }
2620 }
2621 
2622 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2623   switch(typ) {
2624     case T_BYTE:
2625       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2626       break;
2627     case T_SHORT:
2628       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2629       break;
2630     case T_INT:
2631     case T_FLOAT:
2632       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2633       break;
2634     case T_LONG:
2635     case T_DOUBLE:
2636       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2637       break;
2638     default:
2639       assert(false,"Should not reach here.");
2640       break;
2641   }
2642 }
2643 
2644 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2645   assert(vlen_in_bytes <= 32, "");
2646   int esize = type2aelembytes(bt);
2647   if (vlen_in_bytes == 32) {
2648     assert(vtmp == xnoreg, "required.");
2649     if (esize >= 4) {
2650       vtestps(src1, src2, AVX_256bit);
2651     } else {
2652       vptest(src1, src2, AVX_256bit);
2653     }
2654     return;
2655   }
2656   if (vlen_in_bytes < 16) {
    // Duplicate the lower part to fill the whole register;
    // there is no need to do so for src2.
2659     assert(vtmp != xnoreg, "required");
2660     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
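    // 0x00 broadcasts dword 0 across the register (4-byte vector); 0x04
    // selects dwords {0, 1, 0, 0}, keeping the valid low qword and padding
    // the upper half with copies of dword 0 (8-byte vector). Either way every
    // lane tested holds valid data.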
2661     pshufd(vtmp, src1, shuffle_imm);
2662   } else {
2663     assert(vtmp == xnoreg, "required");
2664     vtmp = src1;
2665   }
2666   if (esize >= 4 && VM_Version::supports_avx()) {
2667     vtestps(vtmp, src2, AVX_128bit);
2668   } else {
2669     ptest(vtmp, src2);
2670   }
2671 }
2672 
2673 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2674 #ifdef ASSERT
2675   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2676   bool is_bw_supported = VM_Version::supports_avx512bw();
2677   if (is_bw && !is_bw_supported) {
2678     assert(vlen_enc != Assembler::AVX_512bit, "required");
2679     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2680            "XMM register should be 0-15");
2681   }
2682 #endif // ASSERT
2683   switch (elem_bt) {
2684     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2685     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2686     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2687     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2688     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2689     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2690     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2691   }
2692 }
2693 
2694 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2695   assert(UseAVX >= 2, "required");
2696   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2697   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2698   if ((UseAVX > 2) &&
2699       (!is_bw || VM_Version::supports_avx512bw()) &&
2700       (!is_vl || VM_Version::supports_avx512vl())) {
2701     switch (elem_bt) {
2702       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2703       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2704       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2705       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2706       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2707     }
2708   } else {
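    // Broadcasting straight from a GPR is an AVX-512 form; without it, move
    // the scalar into an XMM register first and broadcast from there using
    // the AVX2 register-source forms.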
2709     assert(vlen_enc != Assembler::AVX_512bit, "required");
2710     assert((dst->encoding() < 16),"XMM register should be 0-15");
2711     switch (elem_bt) {
2712       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2713       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2714       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2715       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2716       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2717       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2718       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2719     }
2720   }
2721 }
2722 
2723 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2724   switch (to_elem_bt) {
2725     case T_SHORT:
2726       vpmovsxbw(dst, src, vlen_enc);
2727       break;
2728     case T_INT:
2729       vpmovsxbd(dst, src, vlen_enc);
2730       break;
2731     case T_FLOAT:
2732       vpmovsxbd(dst, src, vlen_enc);
2733       vcvtdq2ps(dst, dst, vlen_enc);
2734       break;
2735     case T_LONG:
2736       vpmovsxbq(dst, src, vlen_enc);
2737       break;
2738     case T_DOUBLE: {
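      // vcvtdq2pd doubles the element size, so the intermediate dword vector
      // needs only half the width of the final double vector.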
2739       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2740       vpmovsxbd(dst, src, mid_vlen_enc);
2741       vcvtdq2pd(dst, dst, vlen_enc);
2742       break;
2743     }
2744     default:
2745       fatal("Unsupported type %s", type2name(to_elem_bt));
2746       break;
2747   }
2748 }
2749 
2750 //-------------------------------------------------------------------------------------------
2751 
// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through the stack.
2754 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2755                                          Register cnt1, Register cnt2,
2756                                          int int_cnt2,  Register result,
2757                                          XMMRegister vec, Register tmp,
2758                                          int ae) {
2759   ShortBranchVerifier sbv(this);
2760   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2761   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2762 
2763   // This method uses the pcmpestri instruction with bound registers
2764   //   inputs:
2765   //     xmm - substring
2766   //     rax - substring length (elements count)
2767   //     mem - scanned string
2768   //     rdx - string length (elements count)
2769   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2770   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2771   //   outputs:
2772   //     rcx - matched index in string
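  //   flags:
  //     CF == 1 - a (possibly partial) match of the substring was found
  //     OF == 1 - the match starts at element 0 of the scanned vector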
2773   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2774   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2775   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2776   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2777   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
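  // scale1 is the element size of the scanned string (str1) and scale2 that
  // of the substring (str2); they differ only for UL, where the string is
  // UTF-16 but the substring is Latin-1.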
2778 
2779   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2780         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2781         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2782 
2783   // Note, inline_string_indexOf() generates checks:
2784   // if (substr.count > string.count) return -1;
2785   // if (substr.count == 0) return 0;
2786   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2787 
2788   // Load substring.
2789   if (ae == StrIntrinsicNode::UL) {
2790     pmovzxbw(vec, Address(str2, 0));
2791   } else {
2792     movdqu(vec, Address(str2, 0));
2793   }
2794   movl(cnt2, int_cnt2);
2795   movptr(result, str1); // string addr
2796 
2797   if (int_cnt2 > stride) {
2798     jmpb(SCAN_TO_SUBSTR);
2799 
    // Reload substr for rescan; this code
2801     // is executed only for large substrings (> 8 chars)
2802     bind(RELOAD_SUBSTR);
2803     if (ae == StrIntrinsicNode::UL) {
2804       pmovzxbw(vec, Address(str2, 0));
2805     } else {
2806       movdqu(vec, Address(str2, 0));
2807     }
2808     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2809 
2810     bind(RELOAD_STR);
2811     // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
2813     // again. Start from the next element after the previous match.
2814 
    // cnt2 is the number of remaining substring elements and
    // cnt1 the number of remaining string elements when the compare failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2818     subl(cnt1, cnt2);
2819     addl(cnt1, int_cnt2);
2820     movl(cnt2, int_cnt2); // Now restore cnt2
2821 
2822     decrementl(cnt1);     // Shift to next element
2823     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than substring
2825 
2826     addptr(result, (1<<scale1));
2827 
2828   } // (int_cnt2 > 8)
2829 
2830   // Scan string for start of substr in 16-byte vectors
2831   bind(SCAN_TO_SUBSTR);
2832   pcmpestri(vec, Address(result, 0), mode);
2833   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2834   subl(cnt1, stride);
2835   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2836   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than substring
2838   addptr(result, 16);
2839   jmpb(SCAN_TO_SUBSTR);
2840 
2841   // Found a potential substr
2842   bind(FOUND_CANDIDATE);
2843   // Matched whole vector if first element matched (tmp(rcx) == 0).
2844   if (int_cnt2 == stride) {
2845     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2846   } else { // int_cnt2 > 8
2847     jccb(Assembler::overflow, FOUND_SUBSTR);
2848   }
2849   // After pcmpestri tmp(rcx) contains matched element index
2850   // Compute start addr of substr
2851   lea(result, Address(result, tmp, scale1));
2852 
2853   // Make sure string is still long enough
2854   subl(cnt1, tmp);
2855   cmpl(cnt1, cnt2);
2856   if (int_cnt2 == stride) {
2857     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2858   } else { // int_cnt2 > 8
2859     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2860   }
  // Fewer elements left than the substring.
2862 
2863   bind(RET_NOT_FOUND);
2864   movl(result, -1);
2865   jmp(EXIT);
2866 
2867   if (int_cnt2 > stride) {
2868     // This code is optimized for the case when whole substring
2869     // is matched if its head is matched.
2870     bind(MATCH_SUBSTR_HEAD);
2871     pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
2873     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2874 
2875     Label CONT_SCAN_SUBSTR;
2876     // Compare the rest of substring (> 8 chars).
2877     bind(FOUND_SUBSTR);
2878     // First 8 chars are already matched.
2879     negptr(cnt2);
2880     addptr(cnt2, stride);
2881 
2882     bind(SCAN_SUBSTR);
2883     subl(cnt1, stride);
2884     cmpl(cnt2, -stride); // Do not read beyond substring
2885     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2886     // Back-up strings to avoid reading beyond substring:
2887     // cnt1 = cnt1 - cnt2 + 8
2888     addl(cnt1, cnt2); // cnt2 is negative
2889     addl(cnt1, stride);
2890     movl(cnt2, stride); negptr(cnt2);
2891     bind(CONT_SCAN_SUBSTR);
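    // cnt2 is negative here: -(elements still to compare). Since tail_off
    // points just past the substring, base + cnt2*elem_size + tail_off
    // addresses the next 16-byte window to compare.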
2892     if (int_cnt2 < (int)G) {
2893       int tail_off1 = int_cnt2<<scale1;
2894       int tail_off2 = int_cnt2<<scale2;
2895       if (ae == StrIntrinsicNode::UL) {
2896         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2897       } else {
2898         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2899       }
2900       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2901     } else {
2902       // calculate index in register to avoid integer overflow (int_cnt2*2)
2903       movl(tmp, int_cnt2);
2904       addptr(tmp, cnt2);
2905       if (ae == StrIntrinsicNode::UL) {
2906         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2907       } else {
2908         movdqu(vec, Address(str2, tmp, scale2, 0));
2909       }
2910       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2911     }
2912     // Need to reload strings pointers if not matched whole vector
2913     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2914     addptr(cnt2, stride);
2915     jcc(Assembler::negative, SCAN_SUBSTR);
2916     // Fall through if found full substring
2917 
2918   } // (int_cnt2 > 8)
2919 
2920   bind(RET_FOUND);
2921   // Found result if we matched full small substring.
2922   // Compute substr offset
2923   subptr(result, str1);
2924   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2925     shrl(result, 1); // index
2926   }
2927   bind(EXIT);
2928 
2929 } // string_indexofC8
2930 
// Small strings are loaded through the stack if they cross a page boundary.
2932 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2933                                        Register cnt1, Register cnt2,
2934                                        int int_cnt2,  Register result,
2935                                        XMMRegister vec, Register tmp,
2936                                        int ae) {
2937   ShortBranchVerifier sbv(this);
2938   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2939   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2940 
2941   //
  // int_cnt2 is the length of a small (< 8 chars) constant substring
  // or (-1) for a non-constant substring, in which case its length
  // is in the cnt2 register.
2945   //
2946   // Note, inline_string_indexOf() generates checks:
2947   // if (substr.count > string.count) return -1;
2948   // if (substr.count == 0) return 0;
2949   //
2950   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2951   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2952   // This method uses the pcmpestri instruction with bound registers
2953   //   inputs:
2954   //     xmm - substring
2955   //     rax - substring length (elements count)
2956   //     mem - scanned string
2957   //     rdx - string length (elements count)
2958   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2959   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2960   //   outputs:
2961   //     rcx - matched index in string
2962   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2963   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2964   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2965   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2966 
2967   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2968         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2969         FOUND_CANDIDATE;
2970 
2971   { //========================================================
2972     // We don't know where these strings are located
2973     // and we can't read beyond them. Load them through stack.
2974     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2975 
2976     movptr(tmp, rsp); // save old SP
2977 
2978     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2979       if (int_cnt2 == (1>>scale2)) { // One byte
2980         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2981         load_unsigned_byte(result, Address(str2, 0));
2982         movdl(vec, result); // move 32 bits
2983       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2984         // Not enough header space in 32-bit VM: 12+3 = 15.
2985         movl(result, Address(str2, -1));
2986         shrl(result, 8);
2987         movdl(vec, result); // move 32 bits
2988       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2989         load_unsigned_short(result, Address(str2, 0));
2990         movdl(vec, result); // move 32 bits
2991       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2992         movdl(vec, Address(str2, 0)); // move 32 bits
2993       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2994         movq(vec, Address(str2, 0));  // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
2996         // Array header size is 12 bytes in 32-bit VM
2997         // + 6 bytes for 3 chars == 18 bytes,
2998         // enough space to load vec and shift.
2999         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3000         if (ae == StrIntrinsicNode::UL) {
3001           int tail_off = int_cnt2-8;
3002           pmovzxbw(vec, Address(str2, tail_off));
3003           psrldq(vec, -2*tail_off);
3004         }
3005         else {
3006           int tail_off = int_cnt2*(1<<scale2);
3007           movdqu(vec, Address(str2, tail_off-16));
3008           psrldq(vec, 16-tail_off);
3009         }
3010       }
3011     } else { // not constant substring
3012       cmpl(cnt2, stride);
3013       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3014 
      // We can read beyond the string if str+16 does not cross a page boundary,
      // since heaps are aligned and mapped by pages.
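      // (addr & (page_size-1)) > page_size-16 would mean a 16-byte load at
      // addr crosses into the next page.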
3017       assert(os::vm_page_size() < (int)G, "default page should be small");
3018       movl(result, str2); // We need only low 32 bits
3019       andl(result, ((int)os::vm_page_size()-1));
3020       cmpl(result, ((int)os::vm_page_size()-16));
3021       jccb(Assembler::belowEqual, CHECK_STR);
3022 
      // Move small strings to the stack to allow loading 16 bytes into vec.
3024       subptr(rsp, 16);
3025       int stk_offset = wordSize-(1<<scale2);
3026       push(cnt2);
3027 
3028       bind(COPY_SUBSTR);
3029       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3030         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3031         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3032       } else if (ae == StrIntrinsicNode::UU) {
3033         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3034         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3035       }
3036       decrement(cnt2);
3037       jccb(Assembler::notZero, COPY_SUBSTR);
3038 
3039       pop(cnt2);
3040       movptr(str2, rsp);  // New substring address
3041     } // non constant
3042 
3043     bind(CHECK_STR);
3044     cmpl(cnt1, stride);
3045     jccb(Assembler::aboveEqual, BIG_STRINGS);
3046 
3047     // Check cross page boundary.
3048     movl(result, str1); // We need only low 32 bits
3049     andl(result, ((int)os::vm_page_size()-1));
3050     cmpl(result, ((int)os::vm_page_size()-16));
3051     jccb(Assembler::belowEqual, BIG_STRINGS);
3052 
3053     subptr(rsp, 16);
3054     int stk_offset = -(1<<scale1);
3055     if (int_cnt2 < 0) { // not constant
3056       push(cnt2);
3057       stk_offset += wordSize;
3058     }
3059     movl(cnt2, cnt1);
3060 
3061     bind(COPY_STR);
3062     if (ae == StrIntrinsicNode::LL) {
3063       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3064       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3065     } else {
3066       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3067       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3068     }
3069     decrement(cnt2);
3070     jccb(Assembler::notZero, COPY_STR);
3071 
3072     if (int_cnt2 < 0) { // not constant
3073       pop(cnt2);
3074     }
3075     movptr(str1, rsp);  // New string address
3076 
3077     bind(BIG_STRINGS);
3078     // Load substring.
3079     if (int_cnt2 < 0) { // -1
3080       if (ae == StrIntrinsicNode::UL) {
3081         pmovzxbw(vec, Address(str2, 0));
3082       } else {
3083         movdqu(vec, Address(str2, 0));
3084       }
3085       push(cnt2);       // substr count
3086       push(str2);       // substr addr
3087       push(str1);       // string addr
3088     } else {
3089       // Small (< 8 chars) constant substrings are loaded already.
3090       movl(cnt2, int_cnt2);
3091     }
3092     push(tmp);  // original SP
3093 
3094   } // Finished loading
3095 
3096   //========================================================
3097   // Start search
3098   //
3099 
3100   movptr(result, str1); // string addr
3101 
  if (int_cnt2 < 0) {  // Only for a non-constant substring
3103     jmpb(SCAN_TO_SUBSTR);
3104 
3105     // SP saved at sp+0
3106     // String saved at sp+1*wordSize
3107     // Substr saved at sp+2*wordSize
3108     // Substr count saved at sp+3*wordSize
3109 
    // Reload substr for rescan; this code
3111     // is executed only for large substrings (> 8 chars)
3112     bind(RELOAD_SUBSTR);
3113     movptr(str2, Address(rsp, 2*wordSize));
3114     movl(cnt2, Address(rsp, 3*wordSize));
3115     if (ae == StrIntrinsicNode::UL) {
3116       pmovzxbw(vec, Address(str2, 0));
3117     } else {
3118       movdqu(vec, Address(str2, 0));
3119     }
3120     // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
3122     // again. Start from the next element after the previous match.
3123     subptr(str1, result); // Restore counter
3124     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3125       shrl(str1, 1);
3126     }
3127     addl(cnt1, str1);
3128     decrementl(cnt1);   // Shift to next element
3129     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than substring
3131 
3132     addptr(result, (1<<scale1));
3133   } // non constant
3134 
3135   // Scan string for start of substr in 16-byte vectors
3136   bind(SCAN_TO_SUBSTR);
3137   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3138   pcmpestri(vec, Address(result, 0), mode);
3139   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3140   subl(cnt1, stride);
3141   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3142   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than substring
3144   addptr(result, 16);
3145 
3146   bind(ADJUST_STR);
3147   cmpl(cnt1, stride); // Do not read beyond string
3148   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3149   // Back-up string to avoid reading beyond string.
3150   lea(result, Address(result, cnt1, scale1, -16));
3151   movl(cnt1, stride);
3152   jmpb(SCAN_TO_SUBSTR);
3153 
3154   // Found a potential substr
3155   bind(FOUND_CANDIDATE);
3156   // After pcmpestri tmp(rcx) contains matched element index
3157 
3158   // Make sure string is still long enough
3159   subl(cnt1, tmp);
3160   cmpl(cnt1, cnt2);
3161   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Fewer elements left than the substring.
3163 
3164   bind(RET_NOT_FOUND);
3165   movl(result, -1);
3166   jmp(CLEANUP);
3167 
3168   bind(FOUND_SUBSTR);
3169   // Compute start addr of substr
3170   lea(result, Address(result, tmp, scale1));
3171   if (int_cnt2 > 0) { // Constant substring
3172     // Repeat search for small substring (< 8 chars)
3173     // from new point without reloading substring.
3174     // Have to check that we don't read beyond string.
3175     cmpl(tmp, stride-int_cnt2);
3176     jccb(Assembler::greater, ADJUST_STR);
3177     // Fall through if matched whole substring.
3178   } else { // non constant
3179     assert(int_cnt2 == -1, "should be != 0");
3180 
3181     addl(tmp, cnt2);
3182     // Found result if we matched whole substring.
3183     cmpl(tmp, stride);
3184     jcc(Assembler::lessEqual, RET_FOUND);
3185 
3186     // Repeat search for small substring (<= 8 chars)
3187     // from new point 'str1' without reloading substring.
3188     cmpl(cnt2, stride);
3189     // Have to check that we don't read beyond string.
3190     jccb(Assembler::lessEqual, ADJUST_STR);
3191 
3192     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3193     // Compare the rest of substring (> 8 chars).
3194     movptr(str1, result);
3195 
3196     cmpl(tmp, cnt2);
3197     // First 8 chars are already matched.
3198     jccb(Assembler::equal, CHECK_NEXT);
3199 
3200     bind(SCAN_SUBSTR);
3201     pcmpestri(vec, Address(str1, 0), mode);
3202     // Need to reload strings pointers if not matched whole vector
3203     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3204 
3205     bind(CHECK_NEXT);
3206     subl(cnt2, stride);
3207     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3208     addptr(str1, 16);
3209     if (ae == StrIntrinsicNode::UL) {
3210       addptr(str2, 8);
3211     } else {
3212       addptr(str2, 16);
3213     }
3214     subl(cnt1, stride);
3215     cmpl(cnt2, stride); // Do not read beyond substring
3216     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3217     // Back-up strings to avoid reading beyond substring.
3218 
3219     if (ae == StrIntrinsicNode::UL) {
3220       lea(str2, Address(str2, cnt2, scale2, -8));
3221       lea(str1, Address(str1, cnt2, scale1, -16));
3222     } else {
3223       lea(str2, Address(str2, cnt2, scale2, -16));
3224       lea(str1, Address(str1, cnt2, scale1, -16));
3225     }
3226     subl(cnt1, cnt2);
3227     movl(cnt2, stride);
3228     addl(cnt1, stride);
3229     bind(CONT_SCAN_SUBSTR);
3230     if (ae == StrIntrinsicNode::UL) {
3231       pmovzxbw(vec, Address(str2, 0));
3232     } else {
3233       movdqu(vec, Address(str2, 0));
3234     }
3235     jmp(SCAN_SUBSTR);
3236 
3237     bind(RET_FOUND_LONG);
3238     movptr(str1, Address(rsp, wordSize));
3239   } // non constant
3240 
3241   bind(RET_FOUND);
3242   // Compute substr offset
3243   subptr(result, str1);
3244   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3245     shrl(result, 1); // index
3246   }
3247   bind(CLEANUP);
3248   pop(rsp); // restore SP
3249 
3250 } // string_indexof
3251 
3252 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3253                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3254   ShortBranchVerifier sbv(this);
3255   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3256 
3257   int stride = 8;
3258 
3259   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3260         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3261         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3262         FOUND_SEQ_CHAR, DONE_LABEL;
3263 
3264   movptr(result, str1);
3265   if (UseAVX >= 2) {
3266     cmpl(cnt1, stride);
3267     jcc(Assembler::less, SCAN_TO_CHAR);
3268     cmpl(cnt1, 2*stride);
3269     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3270     movdl(vec1, ch);
3271     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3272     vpxor(vec2, vec2);
3273     movl(tmp, cnt1);
3274     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3275     andl(cnt1,0x0000000F);  //tail count (in chars)
3276 
3277     bind(SCAN_TO_16_CHAR_LOOP);
3278     vmovdqu(vec3, Address(result, 0));
3279     vpcmpeqw(vec3, vec3, vec1, 1);
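    // vec2 is all zeros, so vptest sets CF iff vec3 is all zeros;
    // a clear CF means at least one word lane compared equal to ch.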
3280     vptest(vec2, vec3);
3281     jcc(Assembler::carryClear, FOUND_CHAR);
3282     addptr(result, 32);
3283     subl(tmp, 2*stride);
3284     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3285     jmp(SCAN_TO_8_CHAR);
3286     bind(SCAN_TO_8_CHAR_INIT);
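    // Broadcast ch to all 8 word lanes: pshuflw replicates word 0 across
    // the low quadword, then pshufd replicates the low dword.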
3287     movdl(vec1, ch);
3288     pshuflw(vec1, vec1, 0x00);
3289     pshufd(vec1, vec1, 0);
3290     pxor(vec2, vec2);
3291   }
3292   bind(SCAN_TO_8_CHAR);
3293   cmpl(cnt1, stride);
3294   jcc(Assembler::less, SCAN_TO_CHAR);
3295   if (UseAVX < 2) {
3296     movdl(vec1, ch);
3297     pshuflw(vec1, vec1, 0x00);
3298     pshufd(vec1, vec1, 0);
3299     pxor(vec2, vec2);
3300   }
3301   movl(tmp, cnt1);
3302   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3303   andl(cnt1,0x00000007);  //tail count (in chars)
3304 
3305   bind(SCAN_TO_8_CHAR_LOOP);
3306   movdqu(vec3, Address(result, 0));
3307   pcmpeqw(vec3, vec1);
3308   ptest(vec2, vec3);
3309   jcc(Assembler::carryClear, FOUND_CHAR);
3310   addptr(result, 16);
3311   subl(tmp, stride);
3312   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3313   bind(SCAN_TO_CHAR);
3314   testl(cnt1, cnt1);
3315   jcc(Assembler::zero, RET_NOT_FOUND);
3316   bind(SCAN_TO_CHAR_LOOP);
3317   load_unsigned_short(tmp, Address(result, 0));
3318   cmpl(ch, tmp);
3319   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3320   addptr(result, 2);
3321   subl(cnt1, 1);
3322   jccb(Assembler::zero, RET_NOT_FOUND);
3323   jmp(SCAN_TO_CHAR_LOOP);
3324 
3325   bind(RET_NOT_FOUND);
3326   movl(result, -1);
3327   jmpb(DONE_LABEL);
3328 
3329   bind(FOUND_CHAR);
3330   if (UseAVX >= 2) {
3331     vpmovmskb(tmp, vec3);
3332   } else {
3333     pmovmskb(tmp, vec3);
3334   }
3335   bsfl(ch, tmp);
3336   addptr(result, ch);
3337 
3338   bind(FOUND_SEQ_CHAR);
3339   subptr(result, str1);
3340   shrl(result, 1);
3341 
3342   bind(DONE_LABEL);
3343 } // string_indexof_char
3344 
3345 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3346                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3347   ShortBranchVerifier sbv(this);
3348   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3349 
3350   int stride = 16;
3351 
3352   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3353         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3354         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3355         FOUND_SEQ_CHAR, DONE_LABEL;
3356 
3357   movptr(result, str1);
3358   if (UseAVX >= 2) {
3359     cmpl(cnt1, stride);
3360     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3361     cmpl(cnt1, stride*2);
3362     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3363     movdl(vec1, ch);
3364     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3365     vpxor(vec2, vec2);
3366     movl(tmp, cnt1);
3367     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3368     andl(cnt1,0x0000001F);  //tail count (in chars)
3369 
3370     bind(SCAN_TO_32_CHAR_LOOP);
3371     vmovdqu(vec3, Address(result, 0));
3372     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3373     vptest(vec2, vec3);
3374     jcc(Assembler::carryClear, FOUND_CHAR);
3375     addptr(result, 32);
3376     subl(tmp, stride*2);
3377     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3378     jmp(SCAN_TO_16_CHAR);
3379 
3380     bind(SCAN_TO_16_CHAR_INIT);
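    // Broadcast ch to all byte lanes: pshufb with an all-zero shuffle mask
    // replicates byte 0 of vec1.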
3381     movdl(vec1, ch);
3382     pxor(vec2, vec2);
3383     pshufb(vec1, vec2);
3384   }
3385 
3386   bind(SCAN_TO_16_CHAR);
3387   cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3389   if (UseAVX < 2) {
3390     movdl(vec1, ch);
3391     pxor(vec2, vec2);
3392     pshufb(vec1, vec2);
3393   }
3394   movl(tmp, cnt1);
3395   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3396   andl(cnt1,0x0000000F);  //tail count (in bytes)
3397 
3398   bind(SCAN_TO_16_CHAR_LOOP);
3399   movdqu(vec3, Address(result, 0));
3400   pcmpeqb(vec3, vec1);
3401   ptest(vec2, vec3);
3402   jcc(Assembler::carryClear, FOUND_CHAR);
3403   addptr(result, 16);
3404   subl(tmp, stride);
3405   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3406 
3407   bind(SCAN_TO_CHAR_INIT);
3408   testl(cnt1, cnt1);
3409   jcc(Assembler::zero, RET_NOT_FOUND);
3410   bind(SCAN_TO_CHAR_LOOP);
3411   load_unsigned_byte(tmp, Address(result, 0));
3412   cmpl(ch, tmp);
3413   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3414   addptr(result, 1);
3415   subl(cnt1, 1);
3416   jccb(Assembler::zero, RET_NOT_FOUND);
3417   jmp(SCAN_TO_CHAR_LOOP);
3418 
3419   bind(RET_NOT_FOUND);
3420   movl(result, -1);
3421   jmpb(DONE_LABEL);
3422 
3423   bind(FOUND_CHAR);
3424   if (UseAVX >= 2) {
3425     vpmovmskb(tmp, vec3);
3426   } else {
3427     pmovmskb(tmp, vec3);
3428   }
3429   bsfl(ch, tmp);
3430   addptr(result, ch);
3431 
3432   bind(FOUND_SEQ_CHAR);
3433   subptr(result, str1);
3434 
3435   bind(DONE_LABEL);
3436 } // stringL_indexof_char
3437 
3438 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3439   switch (eltype) {
3440   case T_BOOLEAN: return sizeof(jboolean);
3441   case T_BYTE:  return sizeof(jbyte);
3442   case T_SHORT: return sizeof(jshort);
3443   case T_CHAR:  return sizeof(jchar);
3444   case T_INT:   return sizeof(jint);
3445   default:
3446     ShouldNotReachHere();
3447     return -1;
3448   }
3449 }
3450 
3451 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3452   switch (eltype) {
3453   // T_BOOLEAN used as surrogate for unsigned byte
3454   case T_BOOLEAN: movzbl(dst, src);   break;
3455   case T_BYTE:    movsbl(dst, src);   break;
3456   case T_SHORT:   movswl(dst, src);   break;
3457   case T_CHAR:    movzwl(dst, src);   break;
3458   case T_INT:     movl(dst, src);     break;
3459   default:
3460     ShouldNotReachHere();
3461   }
3462 }
3463 
3464 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3465   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3466 }
3467 
3468 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3469   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3470 }
3471 
3472 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3473   const int vlen = Assembler::AVX_256bit;
3474   switch (eltype) {
3475   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3476   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3477   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3478   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3479   case T_INT:
3480     // do nothing
3481     break;
3482   default:
3483     ShouldNotReachHere();
3484   }
3485 }
3486 
3487 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3488                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3489                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3490                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3491                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3492                                         BasicType eltype) {
3493   ShortBranchVerifier sbv(this);
3494   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3495   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3496   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3497 
3498   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3499         SHORT_UNROLLED_LOOP_EXIT,
3500         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3501         UNROLLED_VECTOR_LOOP_BEGIN,
3502         END;
3503   switch (eltype) {
3504   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3505   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3506   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3507   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3508   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3509   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3510   }
3511 
3512   // For "renaming" for readibility of the code
3513   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3514                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3515                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3516 
3517   const int elsize = arrays_hashcode_elsize(eltype);
3518 
3519   /*
3520     if (cnt1 >= 2) {
3521       if (cnt1 >= 32) {
3522         UNROLLED VECTOR LOOP
3523       }
3524       UNROLLED SCALAR LOOP
3525     }
3526     SINGLE SCALAR
3527    */
3528 
3529   cmpl(cnt1, 32);
3530   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3531 
3532   // cnt1 >= 32 && generate_vectorized_loop
3533   xorl(index, index);
3534 
3535   // vresult = IntVector.zero(I256);
3536   for (int idx = 0; idx < 4; idx++) {
3537     vpxor(vresult[idx], vresult[idx]);
3538   }
3539   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3540   Register bound = tmp2;
3541   Register next = tmp3;
3542   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3543   movl(next, Address(tmp2, 0));
3544   movdl(vnext, next);
3545   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
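  // vnext holds powers_of_31[0], the coefficient step applied once per
  // 32-element iteration (assuming the stub table stores 31^32 .. 31^0,
  // this is 31^32, one full unroll stride).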
3546 
3547   // index = 0;
3548   // bound = cnt1 & ~(32 - 1);
3549   movl(bound, cnt1);
3550   andl(bound, ~(32 - 1));
3551   // for (; index < bound; index += 32) {
3552   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3553   // result *= next;
3554   imull(result, next);
  // loop fission to front-load the cost of fetching from memory; OOO execution
  // can then hopefully do a better job of prefetching
3557   for (int idx = 0; idx < 4; idx++) {
3558     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3559   }
3560   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3561   for (int idx = 0; idx < 4; idx++) {
3562     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3563     arrays_hashcode_elvcast(vtmp[idx], eltype);
3564     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3565   }
3566   // index += 32;
3567   addl(index, 32);
3568   // index < bound;
3569   cmpl(index, bound);
3570   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3571   // }
3572 
3573   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3574   subl(cnt1, bound);
3575   // release bound
3576 
3577   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3578   for (int idx = 0; idx < 4; idx++) {
3579     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3580     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3581     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3582   }
3583   // result += vresult.reduceLanes(ADD);
3584   for (int idx = 0; idx < 4; idx++) {
3585     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3586   }
3587 
3588   // } else if (cnt1 < 32) {
3589 
3590   bind(SHORT_UNROLLED_BEGIN);
3591   // int i = 1;
3592   movl(index, 1);
3593   cmpl(index, cnt1);
3594   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3595 
3596   // for (; i < cnt1 ; i += 2) {
3597   bind(SHORT_UNROLLED_LOOP_BEGIN);
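  // One unrolled step computes result = result*31^2 + a[i-1]*31 + a[i]
  // (961 == 31*31, and 31*x is formed below as (x << 5) - x).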
3598   movl(tmp3, 961);
3599   imull(result, tmp3);
3600   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3601   movl(tmp3, tmp2);
3602   shll(tmp3, 5);
3603   subl(tmp3, tmp2);
3604   addl(result, tmp3);
3605   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3606   addl(result, tmp3);
3607   addl(index, 2);
3608   cmpl(index, cnt1);
3609   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3610 
3611   // }
3612   // if (i >= cnt1) {
3613   bind(SHORT_UNROLLED_LOOP_EXIT);
3614   jccb(Assembler::greater, END);
3615   movl(tmp2, result);
3616   shll(result, 5);
3617   subl(result, tmp2);
3618   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3619   addl(result, tmp3);
3620   // }
3621   bind(END);
3622 
3623   BLOCK_COMMENT("} // arrays_hashcode");
3624 
3625 } // arrays_hashcode
3626 
3627 // helper function for string_compare
3628 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3629                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3630                                            Address::ScaleFactor scale2, Register index, int ae) {
3631   if (ae == StrIntrinsicNode::LL) {
3632     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3633     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3634   } else if (ae == StrIntrinsicNode::UU) {
3635     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3636     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3637   } else {
3638     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3639     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3640   }
3641 }
3642 
3643 // Compare strings, used for char[] and byte[].
3644 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3645                                        Register cnt1, Register cnt2, Register result,
3646                                        XMMRegister vec1, int ae, KRegister mask) {
3647   ShortBranchVerifier sbv(this);
3648   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
  Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only for AVX3
3650   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3651   int stride2x2 = 0x40;
3652   Address::ScaleFactor scale = Address::no_scale;
3653   Address::ScaleFactor scale1 = Address::no_scale;
3654   Address::ScaleFactor scale2 = Address::no_scale;
3655 
3656   if (ae != StrIntrinsicNode::LL) {
3657     stride2x2 = 0x20;
3658   }
3659 
3660   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3661     shrl(cnt2, 1);
3662   }
3663   // Compute the minimum of the string lengths and the
3664   // difference of the string lengths (stack).
3665   // Do the conditional move stuff
3666   movl(result, cnt1);
3667   subl(cnt1, cnt2);
3668   push(cnt1);
3669   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3670 
3671   // Is the minimum length zero?
3672   testl(cnt2, cnt2);
3673   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3674   if (ae == StrIntrinsicNode::LL) {
3675     // Load first bytes
3676     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3677     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3678   } else if (ae == StrIntrinsicNode::UU) {
3679     // Load first characters
3680     load_unsigned_short(result, Address(str1, 0));
3681     load_unsigned_short(cnt1, Address(str2, 0));
3682   } else {
3683     load_unsigned_byte(result, Address(str1, 0));
3684     load_unsigned_short(cnt1, Address(str2, 0));
3685   }
3686   subl(result, cnt1);
3687   jcc(Assembler::notZero,  POP_LABEL);
3688 
3689   if (ae == StrIntrinsicNode::UU) {
3690     // Divide length by 2 to get number of chars
3691     shrl(cnt2, 1);
3692   }
3693   cmpl(cnt2, 1);
3694   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3695 
3696   // Check if the strings start at the same location and setup scale and stride
3697   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3698     cmpptr(str1, str2);
3699     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3700     if (ae == StrIntrinsicNode::LL) {
3701       scale = Address::times_1;
3702       stride = 16;
3703     } else {
3704       scale = Address::times_2;
3705       stride = 8;
3706     }
3707   } else {
3708     scale1 = Address::times_1;
3709     scale2 = Address::times_2;
3710     // scale not used
3711     stride = 8;
3712   }
3713 
3714   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3715     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3716     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3717     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3718     Label COMPARE_TAIL_LONG;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only for AVX3
3720 
3721     int pcmpmask = 0x19;
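    // 0x19 == 0b011001: bits 1:0 = 01 (unsigned words), bits 3:2 = 10
    // (equal-each, i.e. string compare), bits 5:4 = 01 (negated result);
    // for LL bit 0 is cleared below to compare unsigned bytes instead.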
3722     if (ae == StrIntrinsicNode::LL) {
3723       pcmpmask &= ~0x01;
3724     }
3725 
    // Setup to compare 16-char (32-byte) vectors,
    // start from the first character again because it has an aligned address.
3728     if (ae == StrIntrinsicNode::LL) {
3729       stride2 = 32;
3730     } else {
3731       stride2 = 16;
3732     }
3733     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3734       adr_stride = stride << scale;
3735     } else {
3736       adr_stride1 = 8;  //stride << scale1;
3737       adr_stride2 = 16; //stride << scale2;
3738     }
3739 
3740     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3741     // rax and rdx are used by pcmpestri as elements counters
3742     movl(result, cnt2);
3743     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3744     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3745 
3746     // fast path : compare first 2 8-char vectors.
3747     bind(COMPARE_16_CHARS);
3748     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3749       movdqu(vec1, Address(str1, 0));
3750     } else {
3751       pmovzxbw(vec1, Address(str1, 0));
3752     }
3753     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3754     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3755 
3756     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3757       movdqu(vec1, Address(str1, adr_stride));
3758       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3759     } else {
3760       pmovzxbw(vec1, Address(str1, adr_stride1));
3761       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3762     }
3763     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3764     addl(cnt1, stride);
3765 
3766     // Compare the characters at index in cnt1
3767     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3768     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3769     subl(result, cnt2);
3770     jmp(POP_LABEL);
3771 
3772     // Setup the registers to start vector comparison loop
3773     bind(COMPARE_WIDE_VECTORS);
3774     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3775       lea(str1, Address(str1, result, scale));
3776       lea(str2, Address(str2, result, scale));
3777     } else {
3778       lea(str1, Address(str1, result, scale1));
3779       lea(str2, Address(str2, result, scale2));
3780     }
3781     subl(result, stride2);
3782     subl(cnt2, stride2);
3783     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3784     negptr(result);
3785 
    //  In a loop, compare 16 chars (32 bytes) at once using (vpxor+vptest)
3787     bind(COMPARE_WIDE_VECTORS_LOOP);
3788 
3789     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3790       cmpl(cnt2, stride2x2);
3791       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3792       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3793       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3794 
3795       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3796       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3797         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3798         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3799       } else {
3800         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3801         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3802       }
3803       kortestql(mask, mask);
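      // kortestql sets CF iff the mask is all ones, so a clear CF
      // (aboveEqual) means at least one byte pair differed.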
3804       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3805       addptr(result, stride2x2);  // update since we already compared at this addr
3806       subl(cnt2, stride2x2);      // and sub the size too
3807       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3808 
3809       vpxor(vec1, vec1);
3810       jmpb(COMPARE_WIDE_TAIL);
3811     }//if (VM_Version::supports_avx512vlbw())
3812 
3813     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3814     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3815       vmovdqu(vec1, Address(str1, result, scale));
3816       vpxor(vec1, Address(str2, result, scale));
3817     } else {
3818       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3819       vpxor(vec1, Address(str2, result, scale2));
3820     }
3821     vptest(vec1, vec1);
3822     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3823     addptr(result, stride2);
3824     subl(cnt2, stride2);
3825     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3826     // clean upper bits of YMM registers
3827     vpxor(vec1, vec1);
3828 
3829     // compare wide vectors tail
3830     bind(COMPARE_WIDE_TAIL);
3831     testptr(result, result);
3832     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3833 
3834     movl(result, stride2);
3835     movl(cnt2, result);
3836     negptr(result);
3837     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3838 
    // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3840     bind(VECTOR_NOT_EQUAL);
3841     // clean upper bits of YMM registers
3842     vpxor(vec1, vec1);
3843     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3844       lea(str1, Address(str1, result, scale));
3845       lea(str2, Address(str2, result, scale));
3846     } else {
3847       lea(str1, Address(str1, result, scale1));
3848       lea(str2, Address(str2, result, scale2));
3849     }
3850     jmp(COMPARE_16_CHARS);
3851 
    // Compare tail chars, length between 1 and 15 chars
3853     bind(COMPARE_TAIL_LONG);
3854     movl(cnt2, result);
3855     cmpl(cnt2, stride);
3856     jcc(Assembler::less, COMPARE_SMALL_STR);
3857 
3858     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3859       movdqu(vec1, Address(str1, 0));
3860     } else {
3861       pmovzxbw(vec1, Address(str1, 0));
3862     }
3863     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3864     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3865     subptr(cnt2, stride);
3866     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3867     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3868       lea(str1, Address(str1, result, scale));
3869       lea(str2, Address(str2, result, scale));
3870     } else {
3871       lea(str1, Address(str1, result, scale1));
3872       lea(str2, Address(str2, result, scale2));
3873     }
3874     negptr(cnt2);
3875     jmpb(WHILE_HEAD_LABEL);
3876 
3877     bind(COMPARE_SMALL_STR);
3878   } else if (UseSSE42Intrinsics) {
3879     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3880     int pcmpmask = 0x19;
3881     // Setup to compare 8-char (16-byte) vectors,
    // start from the first character again because it has an aligned address.
3883     movl(result, cnt2);
3884     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3885     if (ae == StrIntrinsicNode::LL) {
3886       pcmpmask &= ~0x01;
3887     }
3888     jcc(Assembler::zero, COMPARE_TAIL);
3889     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3890       lea(str1, Address(str1, result, scale));
3891       lea(str2, Address(str2, result, scale));
3892     } else {
3893       lea(str1, Address(str1, result, scale1));
3894       lea(str2, Address(str2, result, scale2));
3895     }
3896     negptr(result);
3897 
3898     // pcmpestri
3899     //   inputs:
    //     vec1 - substring
3901     //     rax - negative string length (elements count)
3902     //     mem - scanned string
3903     //     rdx - string length (elements count)
3904     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3905     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3906     //   outputs:
3907     //     rcx - first mismatched element index
3908     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3909 
3910     bind(COMPARE_WIDE_VECTORS);
3911     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3912       movdqu(vec1, Address(str1, result, scale));
3913       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3914     } else {
3915       pmovzxbw(vec1, Address(str1, result, scale1));
3916       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3917     }
3918     // After pcmpestri cnt1(rcx) contains mismatched element index
3919 
3920     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3921     addptr(result, stride);
3922     subptr(cnt2, stride);
3923     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3924 
3925     // compare wide vectors tail
3926     testptr(result, result);
3927     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3928 
3929     movl(cnt2, stride);
3930     movl(result, stride);
3931     negptr(result);
3932     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3933       movdqu(vec1, Address(str1, result, scale));
3934       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3935     } else {
3936       pmovzxbw(vec1, Address(str1, result, scale1));
3937       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3938     }
3939     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3940 
3941     // Mismatched characters in the vectors
3942     bind(VECTOR_NOT_EQUAL);
3943     addptr(cnt1, result);
3944     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3945     subl(result, cnt2);
3946     jmpb(POP_LABEL);
3947 
3948     bind(COMPARE_TAIL); // limit is zero
3949     movl(cnt2, result);
3950     // Fallthru to tail compare
3951   }
3952   // Shift str2 and str1 to the end of the arrays, negate min
3953   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3954     lea(str1, Address(str1, cnt2, scale));
3955     lea(str2, Address(str2, cnt2, scale));
3956   } else {
3957     lea(str1, Address(str1, cnt2, scale1));
3958     lea(str2, Address(str2, cnt2, scale2));
3959   }
3960   decrementl(cnt2);  // first character was compared already
3961   negptr(cnt2);
3962 
3963   // Compare the rest of the elements
3964   bind(WHILE_HEAD_LABEL);
3965   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3966   subl(result, cnt1);
3967   jccb(Assembler::notZero, POP_LABEL);
3968   increment(cnt2);
3969   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3970 
3971   // Strings are equal up to min length.  Return the length difference.
3972   bind(LENGTH_DIFF_LABEL);
3973   pop(result);
3974   if (ae == StrIntrinsicNode::UU) {
3975     // Divide diff by 2 to get number of chars
3976     sarl(result, 1);
3977   }
3978   jmpb(DONE_LABEL);
3979 
3980   if (VM_Version::supports_avx512vlbw()) {
3981 
3982     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3983 
3984     kmovql(cnt1, mask);
3985     notq(cnt1);
3986     bsfq(cnt2, cnt1);
3987     if (ae != StrIntrinsicNode::LL) {
3988       // Divide diff by 2 to get number of chars
3989       sarl(cnt2, 1);
3990     }
3991     addq(result, cnt2);
3992     if (ae == StrIntrinsicNode::LL) {
3993       load_unsigned_byte(cnt1, Address(str2, result));
3994       load_unsigned_byte(result, Address(str1, result));
3995     } else if (ae == StrIntrinsicNode::UU) {
3996       load_unsigned_short(cnt1, Address(str2, result, scale));
3997       load_unsigned_short(result, Address(str1, result, scale));
3998     } else {
3999       load_unsigned_short(cnt1, Address(str2, result, scale2));
4000       load_unsigned_byte(result, Address(str1, result, scale1));
4001     }
4002     subl(result, cnt1);
4003     jmpb(POP_LABEL);
4004   }//if (VM_Version::supports_avx512vlbw())
4005 
4006   // Discard the stored length difference
4007   bind(POP_LABEL);
4008   pop(cnt1);
4009 
4010   // That's it
4011   bind(DONE_LABEL);
4012   if(ae == StrIntrinsicNode::UL) {
4013     negl(result);
4014   }
4015 
4016 }
4017 
// Search for a non-ASCII character (negative byte value) in a byte array,
4019 // return the index of the first such character, otherwise the length
4020 // of the array segment searched.
4021 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4022 //   @IntrinsicCandidate
4023 //   public static int countPositives(byte[] ba, int off, int len) {
4024 //     for (int i = off; i < off + len; i++) {
4025 //       if (ba[i] < 0) {
4026 //         return i - off;
4027 //       }
4028 //     }
4029 //     return len;
4030 //   }
4031 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4032   Register result, Register tmp1,
4033   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4034   // rsi: byte array
4035   // rcx: len
4036   // rax: result
4037   ShortBranchVerifier sbv(this);
4038   assert_different_registers(ary1, len, result, tmp1);
4039   assert_different_registers(vec1, vec2);
4040   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4041 
4042   movl(result, len); // copy
4043   // len == 0
4044   testl(len, len);
4045   jcc(Assembler::zero, DONE);
4046 
4047   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4048     VM_Version::supports_avx512vlbw() &&
4049     VM_Version::supports_bmi2()) {
4050 
4051     Label test_64_loop, test_tail, BREAK_LOOP;
4052     movl(tmp1, len);
4053     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4054 
4055     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4056     andl(len,  0xffffffc0); // vector count (in chars)
4057     jccb(Assembler::zero, test_tail);
4058 
4059     lea(ary1, Address(ary1, len, Address::times_1));
4060     negptr(len);
4061 
4062     bind(test_64_loop);
    // Check whether our 64 byte-sized elements contain negatives
4064     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4065     kortestql(mask1, mask1);
4066     jcc(Assembler::notZero, BREAK_LOOP);
4067 
4068     addptr(len, 64);
4069     jccb(Assembler::notZero, test_64_loop);
4070 
4071     bind(test_tail);
4072     // bail out when there is nothing to be done
4073     testl(tmp1, -1);
4074     jcc(Assembler::zero, DONE);
4075 
4076 
    // check the tail for absence of negatives
    // ~(~0 << len) gives a mask with the low len bits set
4079     {
4080       Register tmp3_aliased = len;
4081       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4082       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4083       notq(tmp3_aliased);
4084       kmovql(mask2, tmp3_aliased);
4085     }
4086 
4087     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4088     ktestq(mask1, mask2);
4089     jcc(Assembler::zero, DONE);
4090 
    // do a full check for negative bytes in the tail
4092     movl(len, tmp1); // tmp1 holds low 6-bit from original len;
4093                      // ary1 already pointing to the right place
4094     jmpb(TAIL_START);
4095 
4096     bind(BREAK_LOOP);
4097     // At least one byte in the last 64 byte block was negative.
4098     // Set up to look at the last 64 bytes as if they were a tail
4099     lea(ary1, Address(ary1, len, Address::times_1));
4100     addptr(result, len);
4101     // Ignore the very last byte: if all others are positive,
4102     // it must be negative, so we can skip right to the 2+1 byte
4103     // end comparison at this point
4104     orl(result, 63);
4105     movl(len, 63);
4106     // Fallthru to tail compare
4107   } else {
4108 
4109     if (UseAVX >= 2) {
4110       // With AVX2, use 32-byte vector compare
4111       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4112 
4113       // Compare 32-byte vectors
4114       testl(len, 0xffffffe0);   // vector count (in bytes)
4115       jccb(Assembler::zero, TAIL_START);
4116 
4117       andl(len, 0xffffffe0);
4118       lea(ary1, Address(ary1, len, Address::times_1));
4119       negptr(len);
4120 
      movl(tmp1, 0x80808080);   // create mask to test for negative bytes in vector
4122       movdl(vec2, tmp1);
4123       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4124 
4125       bind(COMPARE_WIDE_VECTORS);
4126       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4127       vptest(vec1, vec2);
4128       jccb(Assembler::notZero, BREAK_LOOP);
4129       addptr(len, 32);
4130       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4131 
4132       testl(result, 0x0000001f);   // any bytes remaining?
4133       jcc(Assembler::zero, DONE);
4134 
4135       // Quick test using the already prepared vector mask
4136       movl(len, result);
4137       andl(len, 0x0000001f);
4138       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4139       vptest(vec1, vec2);
4140       jcc(Assembler::zero, DONE);
4141       // There are zeros, jump to the tail to determine exactly where
4142       jmpb(TAIL_START);
4143 
4144       bind(BREAK_LOOP);
4145       // At least one byte in the last 32-byte vector is negative.
4146       // Set up to look at the last 32 bytes as if they were a tail
4147       lea(ary1, Address(ary1, len, Address::times_1));
4148       addptr(result, len);
4149       // Ignore the very last byte: if all others are positive,
4150       // it must be negative, so we can skip right to the 2+1 byte
4151       // end comparison at this point
4152       orl(result, 31);
4153       movl(len, 31);
4154       // Fallthru to tail compare
4155     } else if (UseSSE42Intrinsics) {
4156       // With SSE4.2, use double quad vector compare
4157       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4158 
4159       // Compare 16-byte vectors
4160       testl(len, 0xfffffff0);   // vector count (in bytes)
4161       jcc(Assembler::zero, TAIL_START);
4162 
4163       andl(len, 0xfffffff0);
4164       lea(ary1, Address(ary1, len, Address::times_1));
4165       negptr(len);
4166 
4167       movl(tmp1, 0x80808080);
4168       movdl(vec2, tmp1);
4169       pshufd(vec2, vec2, 0);
4170 
4171       bind(COMPARE_WIDE_VECTORS);
4172       movdqu(vec1, Address(ary1, len, Address::times_1));
4173       ptest(vec1, vec2);
4174       jccb(Assembler::notZero, BREAK_LOOP);
4175       addptr(len, 16);
4176       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4177 
4178       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4179       jcc(Assembler::zero, DONE);
4180 
4181       // Quick test using the already prepared vector mask
4182       movl(len, result);
4183       andl(len, 0x0000000f);   // tail count (in bytes)
4184       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4185       ptest(vec1, vec2);
4186       jcc(Assembler::zero, DONE);
4187       jmpb(TAIL_START);
4188 
4189       bind(BREAK_LOOP);
4190       // At least one byte in the last 16-byte vector is negative.
4191       // Set up and look at the last 16 bytes as if they were a tail
4192       lea(ary1, Address(ary1, len, Address::times_1));
4193       addptr(result, len);
4194       // Ignore the very last byte: if all others are positive,
4195       // it must be negative, so we can skip right to the 2+1 byte
4196       // end comparison at this point
4197       orl(result, 15);
4198       movl(len, 15);
4199       // Fallthru to tail compare
4200     }
4201   }
4202 
4203   bind(TAIL_START);
4204   // Compare 4-byte vectors
4205   andl(len, 0xfffffffc); // vector count (in bytes)
4206   jccb(Assembler::zero, COMPARE_CHAR);
4207 
4208   lea(ary1, Address(ary1, len, Address::times_1));
4209   negptr(len);
4210 
4211   bind(COMPARE_VECTORS);
4212   movl(tmp1, Address(ary1, len, Address::times_1));
4213   andl(tmp1, 0x80808080);
4214   jccb(Assembler::notZero, TAIL_ADJUST);
4215   addptr(len, 4);
4216   jccb(Assembler::notZero, COMPARE_VECTORS);
4217 
  // Compare trailing char and byte (final 1-3 bytes), if any
4219   bind(COMPARE_CHAR);
4220 
4221   testl(result, 0x2);   // tail  char
4222   jccb(Assembler::zero, COMPARE_BYTE);
4223   load_unsigned_short(tmp1, Address(ary1, 0));
4224   andl(tmp1, 0x00008080);
4225   jccb(Assembler::notZero, CHAR_ADJUST);
4226   lea(ary1, Address(ary1, 2));
4227 
4228   bind(COMPARE_BYTE);
4229   testl(result, 0x1);   // tail  byte
4230   jccb(Assembler::zero, DONE);
4231   load_unsigned_byte(tmp1, Address(ary1, 0));
4232   testl(tmp1, 0x00000080);
4233   jccb(Assembler::zero, DONE);
4234   subptr(result, 1);
4235   jmpb(DONE);
4236 
4237   bind(TAIL_ADJUST);
  // There are negative bytes in the last 4-byte block.
  // Adjust result and check the next three bytes
4240   addptr(result, len);
4241   orl(result, 3);
4242   lea(ary1, Address(ary1, len, Address::times_1));
4243   jmpb(COMPARE_CHAR);
4244 
4245   bind(CHAR_ADJUST);
4246   // We are looking at a char + optional byte tail, and found that one
4247   // of the bytes in the char is negative. Adjust the result, check the
4248   // first byte and readjust if needed.
4249   andl(result, 0xfffffffc);
4250   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4251   jccb(Assembler::notZero, DONE);
4252   addptr(result, 1);
4253 
4254   // That's it
4255   bind(DONE);
4256   if (UseAVX >= 2) {
4257     // clean upper bits of YMM registers
4258     vpxor(vec1, vec1);
4259     vpxor(vec2, vec2);
4260   }
4261 }
4262 
4263 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4264 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4265                                       Register limit, Register result, Register chr,
4266                                       XMMRegister vec1, XMMRegister vec2, bool is_char,
4267                                       KRegister mask, bool expand_ary2) {
4268   // for expand_ary2, limit is the (smaller) size of the second array.
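  // Note (inferred from the loops below): with expand_ary2 the code
  // zero-extends ary2's byte elements on the fly (vpmovzxbw / movzbl) and
  // compares them against ary1's 2-byte elements, i.e. ary1 is treated as
  // char-sized data and ary2 as byte-sized data of the same logical length.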
4269   ShortBranchVerifier sbv(this);
4270   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4271 
  assert(!expand_ary2 || UseAVX == 2, "Expansion only implemented for AVX2");
4274 
4275   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4276   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4277 
4278   Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4279   int scaleIncr = expand_ary2 ? 8 : 16;
4280 
4281   if (is_array_equ) {
4282     // Check the input args
4283     cmpoop(ary1, ary2);
4284     jcc(Assembler::equal, TRUE_LABEL);
4285 
4286     // Need additional checks for arrays_equals.
4287     testptr(ary1, ary1);
4288     jcc(Assembler::zero, FALSE_LABEL);
4289     testptr(ary2, ary2);
4290     jcc(Assembler::zero, FALSE_LABEL);
4291 
4292     // Check the lengths
4293     movl(limit, Address(ary1, length_offset));
4294     cmpl(limit, Address(ary2, length_offset));
4295     jcc(Assembler::notEqual, FALSE_LABEL);
4296   }
4297 
4298   // count == 0
4299   testl(limit, limit);
4300   jcc(Assembler::zero, TRUE_LABEL);
4301 
4302   if (is_array_equ) {
4303     // Load array address
4304     lea(ary1, Address(ary1, base_offset));
4305     lea(ary2, Address(ary2, base_offset));
4306   }
4307 
4308   if (is_array_equ && is_char) {
4309     // arrays_equals when used for char[].
    shll(limit, 1);      // convert char count to byte count (still != 0)
4311   }
4312   movl(result, limit); // copy
4313 
4314   if (UseAVX >= 2) {
4315     // With AVX2, use 32-byte vector compare
4316     Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4317 
4318     // Compare 32-byte vectors
4319     if (expand_ary2) {
4320       andl(result, 0x0000000f);  //   tail count (in bytes)
4321       andl(limit, 0xfffffff0);   // vector count (in bytes)
4322       jcc(Assembler::zero, COMPARE_TAIL);
4323     } else {
4324       andl(result, 0x0000001f);  //   tail count (in bytes)
4325       andl(limit, 0xffffffe0);   // vector count (in bytes)
4326       jcc(Assembler::zero, COMPARE_TAIL_16);
4327     }
4328 
4329     lea(ary1, Address(ary1, limit, scaleFactor));
4330     lea(ary2, Address(ary2, limit, Address::times_1));
4331     negptr(limit);
4332 
4333     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4334       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4335 
4336       cmpl(limit, -64);
4337       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4338 
4339       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4340 
4341       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4342       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4343       kortestql(mask, mask);
4344       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4345       addptr(limit, 64);  // update since we already compared at this addr
4346       cmpl(limit, -64);
4347       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4348 
      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL);  // true
      // But since we stopped at the points ary{1,2}+limit, which are
      // no farther than 64 bytes from the ends of the arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
      //
      addptr(result, -64);   // it is safe, because we just came from this area
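      // Illustrative example (assuming this AVX3 path is taken): for
      // 100-byte arrays the wide loop exits with limit == -32 and
      // result == 4 (the tail count); result is now -60, so the 64-byte
      // load below reads bytes [36, 99], exactly the last 64 bytes.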
4359       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4360       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4361       kortestql(mask, mask);
4362       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4363 
4364       jmp(TRUE_LABEL);
4365 
4366       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4367 
4368     }//if (VM_Version::supports_avx512vlbw())
4369 
4370     bind(COMPARE_WIDE_VECTORS);
4371     vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4372     if (expand_ary2) {
4373       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4374     } else {
4375       vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4376     }
4377     vpxor(vec1, vec2);
4378 
4379     vptest(vec1, vec1);
4380     jcc(Assembler::notZero, FALSE_LABEL);
4381     addptr(limit, scaleIncr * 2);
4382     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4383 
4384     testl(result, result);
4385     jcc(Assembler::zero, TRUE_LABEL);
4386 
4387     vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4388     if (expand_ary2) {
4389       vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4390     } else {
4391       vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4392     }
4393     vpxor(vec1, vec2);
4394 
4395     vptest(vec1, vec1);
4396     jcc(Assembler::notZero, FALSE_LABEL);
4397     jmp(TRUE_LABEL);
4398 
4399     bind(COMPARE_TAIL_16); // limit is zero
4400     movl(limit, result);
4401 
4402     // Compare 16-byte chunks
4403     andl(result, 0x0000000f);  //   tail count (in bytes)
4404     andl(limit, 0xfffffff0);   // vector count (in bytes)
4405     jcc(Assembler::zero, COMPARE_TAIL);
4406 
4407     lea(ary1, Address(ary1, limit, scaleFactor));
4408     lea(ary2, Address(ary2, limit, Address::times_1));
4409     negptr(limit);
4410 
4411     bind(COMPARE_WIDE_VECTORS_16);
4412     movdqu(vec1, Address(ary1, limit, scaleFactor));
4413     if (expand_ary2) {
4414       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4415     } else {
4416       movdqu(vec2, Address(ary2, limit, Address::times_1));
4417     }
4418     pxor(vec1, vec2);
4419 
4420     ptest(vec1, vec1);
4421     jcc(Assembler::notZero, FALSE_LABEL);
4422     addptr(limit, scaleIncr);
4423     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4424 
4425     bind(COMPARE_TAIL); // limit is zero
4426     movl(limit, result);
4427     // Fallthru to tail compare
4428   } else if (UseSSE42Intrinsics) {
4429     // With SSE4.2, use double quad vector compare
4430     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4431 
4432     // Compare 16-byte vectors
4433     andl(result, 0x0000000f);  //   tail count (in bytes)
4434     andl(limit, 0xfffffff0);   // vector count (in bytes)
4435     jcc(Assembler::zero, COMPARE_TAIL);
4436 
4437     lea(ary1, Address(ary1, limit, Address::times_1));
4438     lea(ary2, Address(ary2, limit, Address::times_1));
4439     negptr(limit);
4440 
4441     bind(COMPARE_WIDE_VECTORS);
4442     movdqu(vec1, Address(ary1, limit, Address::times_1));
4443     movdqu(vec2, Address(ary2, limit, Address::times_1));
4444     pxor(vec1, vec2);
4445 
4446     ptest(vec1, vec1);
4447     jcc(Assembler::notZero, FALSE_LABEL);
4448     addptr(limit, 16);
4449     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4450 
4451     testl(result, result);
4452     jcc(Assembler::zero, TRUE_LABEL);
4453 
4454     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4455     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4456     pxor(vec1, vec2);
4457 
4458     ptest(vec1, vec1);
4459     jccb(Assembler::notZero, FALSE_LABEL);
4460     jmpb(TRUE_LABEL);
4461 
4462     bind(COMPARE_TAIL); // limit is zero
4463     movl(limit, result);
4464     // Fallthru to tail compare
4465   }
4466 
4467   // Compare 4-byte vectors
4468   if (expand_ary2) {
4469     testl(result, result);
4470     jccb(Assembler::zero, TRUE_LABEL);
4471   } else {
4472     andl(limit, 0xfffffffc); // vector count (in bytes)
4473     jccb(Assembler::zero, COMPARE_CHAR);
4474   }
4475 
4476   lea(ary1, Address(ary1, limit, scaleFactor));
4477   lea(ary2, Address(ary2, limit, Address::times_1));
4478   negptr(limit);
4479 
4480   bind(COMPARE_VECTORS);
4481   if (expand_ary2) {
    // There is no vector instruction that widens bytes and compares them
    // against shorts in one step, so compare one element at a time
4483     movzbl(chr, Address(ary2, limit, Address::times_1));
4484     cmpw(Address(ary1, limit, Address::times_2), chr);
4485     jccb(Assembler::notEqual, FALSE_LABEL);
4486     addptr(limit, 1);
4487     jcc(Assembler::notZero, COMPARE_VECTORS);
4488     jmp(TRUE_LABEL);
4489   } else {
4490     movl(chr, Address(ary1, limit, Address::times_1));
4491     cmpl(chr, Address(ary2, limit, Address::times_1));
4492     jccb(Assembler::notEqual, FALSE_LABEL);
4493     addptr(limit, 4);
4494     jcc(Assembler::notZero, COMPARE_VECTORS);
4495   }
4496 
4497   // Compare trailing char (final 2 bytes), if any
4498   bind(COMPARE_CHAR);
4499   testl(result, 0x2);   // tail  char
4500   jccb(Assembler::zero, COMPARE_BYTE);
4501   load_unsigned_short(chr, Address(ary1, 0));
4502   load_unsigned_short(limit, Address(ary2, 0));
4503   cmpl(chr, limit);
4504   jccb(Assembler::notEqual, FALSE_LABEL);
4505 
4506   if (is_array_equ && is_char) {
4507     bind(COMPARE_BYTE);
4508   } else {
4509     lea(ary1, Address(ary1, 2));
4510     lea(ary2, Address(ary2, 2));
4511 
4512     bind(COMPARE_BYTE);
4513     testl(result, 0x1);   // tail  byte
4514     jccb(Assembler::zero, TRUE_LABEL);
4515     load_unsigned_byte(chr, Address(ary1, 0));
4516     load_unsigned_byte(limit, Address(ary2, 0));
4517     cmpl(chr, limit);
4518     jccb(Assembler::notEqual, FALSE_LABEL);
4519   }
4520   bind(TRUE_LABEL);
4521   movl(result, 1);   // return true
4522   jmpb(DONE);
4523 
4524   bind(FALSE_LABEL);
4525   xorl(result, result); // return false
4526 
4527   // That's it
4528   bind(DONE);
4529   if (UseAVX >= 2) {
4530     // clean upper bits of YMM registers
4531     vpxor(vec1, vec1);
4532     vpxor(vec2, vec2);
4533   }
4534 }
4535 
4536 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4537 #define __ masm.
4538   Register dst = stub.data<0>();
4539   XMMRegister src = stub.data<1>();
4540   address target = stub.data<2>();
4541   __ bind(stub.entry());
4542   __ subptr(rsp, 8);
4543   __ movdbl(Address(rsp), src);
4544   __ call(RuntimeAddress(target));
4545   // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
4546   __ pop(dst);
4547   __ jmp(stub.continuation());
4548 #undef __
4549 }
4550 
4551 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4552   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4553   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
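  // cvttss2si/cvttsd2si return the x86 "integer indefinite" value on NaN or
  // out-of-range inputs (0x80000000 for doubleword results,
  // 0x8000000000000000 for quadword results). The compares below look for
  // that sentinel and divert to a fixup stub which produces the Java result:
  // 0 for NaN, MIN_VALUE/MAX_VALUE for out-of-range inputs.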
4554 
4555   address slowpath_target;
4556   if (dst_bt == T_INT) {
4557     if (src_bt == T_FLOAT) {
4558       cvttss2sil(dst, src);
4559       cmpl(dst, 0x80000000);
4560       slowpath_target = StubRoutines::x86::f2i_fixup();
4561     } else {
4562       cvttsd2sil(dst, src);
4563       cmpl(dst, 0x80000000);
4564       slowpath_target = StubRoutines::x86::d2i_fixup();
4565     }
4566   } else {
4567     if (src_bt == T_FLOAT) {
4568       cvttss2siq(dst, src);
4569       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4570       slowpath_target = StubRoutines::x86::f2l_fixup();
4571     } else {
4572       cvttsd2siq(dst, src);
4573       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4574       slowpath_target = StubRoutines::x86::d2l_fixup();
4575     }
4576   }
4577 
4578   // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
4579   int max_size = 23 + (UseAPX ? 1 : 0);
4580   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
4581   jcc(Assembler::equal, stub->entry());
4582   bind(stub->continuation());
4583 }
4584 
4585 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4586                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4587   switch(ideal_opc) {
4588     case Op_LShiftVS:
4589       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4590     case Op_LShiftVI:
4591       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4592     case Op_LShiftVL:
4593       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4594     case Op_RShiftVS:
4595       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4596     case Op_RShiftVI:
4597       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4598     case Op_RShiftVL:
4599       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4600     case Op_URShiftVS:
4601       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4602     case Op_URShiftVI:
4603       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4604     case Op_URShiftVL:
4605       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4606     case Op_RotateRightV:
4607       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4608     case Op_RotateLeftV:
4609       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4610     default:
4611       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4612       break;
4613   }
4614 }
4615 
4616 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4617                                                XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4618   if (is_unsigned) {
4619     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4620   } else {
4621     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4622   }
4623 }
4624 
4625 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4626                                                       XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4627   switch (elem_bt) {
4628     case T_BYTE:
4629       if (ideal_opc == Op_SaturatingAddV) {
4630         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4631       } else {
4632         assert(ideal_opc == Op_SaturatingSubV, "");
4633         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4634       }
4635       break;
4636     case T_SHORT:
4637       if (ideal_opc == Op_SaturatingAddV) {
4638         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4639       } else {
4640         assert(ideal_opc == Op_SaturatingSubV, "");
4641         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4642       }
4643       break;
4644     default:
4645       fatal("Unsupported type %s", type2name(elem_bt));
4646       break;
4647   }
4648 }
4649 
4650 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4651                                                         XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4652   switch (elem_bt) {
4653     case T_BYTE:
4654       if (ideal_opc == Op_SaturatingAddV) {
4655         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4656       } else {
4657         assert(ideal_opc == Op_SaturatingSubV, "");
4658         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4659       }
4660       break;
4661     case T_SHORT:
4662       if (ideal_opc == Op_SaturatingAddV) {
4663         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4664       } else {
4665         assert(ideal_opc == Op_SaturatingSubV, "");
4666         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4667       }
4668       break;
4669     default:
4670       fatal("Unsupported type %s", type2name(elem_bt));
4671       break;
4672   }
4673 }
4674 
4675 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4676                                                Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4677   if (is_unsigned) {
4678     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4679   } else {
4680     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4681   }
4682 }
4683 
4684 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4685                                                       XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4686   switch (elem_bt) {
4687     case T_BYTE:
4688       if (ideal_opc == Op_SaturatingAddV) {
4689         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4690       } else {
4691         assert(ideal_opc == Op_SaturatingSubV, "");
4692         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4693       }
4694       break;
4695     case T_SHORT:
4696       if (ideal_opc == Op_SaturatingAddV) {
4697         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4698       } else {
4699         assert(ideal_opc == Op_SaturatingSubV, "");
4700         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4701       }
4702       break;
4703     default:
4704       fatal("Unsupported type %s", type2name(elem_bt));
4705       break;
4706   }
4707 }
4708 
4709 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4710                                                         XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4711   switch (elem_bt) {
4712     case T_BYTE:
4713       if (ideal_opc == Op_SaturatingAddV) {
4714         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4715       } else {
4716         assert(ideal_opc == Op_SaturatingSubV, "");
4717         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4718       }
4719       break;
4720     case T_SHORT:
4721       if (ideal_opc == Op_SaturatingAddV) {
4722         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4723       } else {
4724         assert(ideal_opc == Op_SaturatingSubV, "");
4725         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4726       }
4727       break;
4728     default:
4729       fatal("Unsupported type %s", type2name(elem_bt));
4730       break;
4731   }
4732 }
4733 
4734 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4735                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4736                                     bool is_varshift) {
4737   switch (ideal_opc) {
4738     case Op_AddVB:
4739       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4740     case Op_AddVS:
4741       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4742     case Op_AddVI:
4743       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4744     case Op_AddVL:
4745       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4746     case Op_AddVF:
4747       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4748     case Op_AddVD:
4749       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4750     case Op_SubVB:
4751       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4752     case Op_SubVS:
4753       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4754     case Op_SubVI:
4755       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4756     case Op_SubVL:
4757       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4758     case Op_SubVF:
4759       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4760     case Op_SubVD:
4761       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4762     case Op_MulVS:
4763       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4764     case Op_MulVI:
4765       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4766     case Op_MulVL:
4767       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4768     case Op_MulVF:
4769       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4770     case Op_MulVD:
4771       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4772     case Op_DivVF:
4773       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4774     case Op_DivVD:
4775       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4776     case Op_SqrtVF:
4777       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4778     case Op_SqrtVD:
4779       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4780     case Op_AbsVB:
4781       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4782     case Op_AbsVS:
4783       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4784     case Op_AbsVI:
4785       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4786     case Op_AbsVL:
4787       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4788     case Op_FmaVF:
4789       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4790     case Op_FmaVD:
4791       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4792     case Op_VectorRearrange:
4793       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4794     case Op_LShiftVS:
4795       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4796     case Op_LShiftVI:
4797       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4798     case Op_LShiftVL:
4799       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4800     case Op_RShiftVS:
4801       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4802     case Op_RShiftVI:
4803       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4804     case Op_RShiftVL:
4805       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4806     case Op_URShiftVS:
4807       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4808     case Op_URShiftVI:
4809       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4810     case Op_URShiftVL:
4811       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4812     case Op_RotateLeftV:
4813       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4814     case Op_RotateRightV:
4815       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4816     case Op_MaxV:
4817       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4818     case Op_MinV:
4819       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4820     case Op_UMinV:
4821       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4822     case Op_UMaxV:
4823       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4824     case Op_XorV:
4825       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4826     case Op_OrV:
4827       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4828     case Op_AndV:
4829       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4830     default:
4831       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4832       break;
4833   }
4834 }
4835 
4836 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4837                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4838   switch (ideal_opc) {
4839     case Op_AddVB:
4840       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4841     case Op_AddVS:
4842       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4843     case Op_AddVI:
4844       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4845     case Op_AddVL:
4846       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4847     case Op_AddVF:
4848       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4849     case Op_AddVD:
4850       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4851     case Op_SubVB:
4852       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4853     case Op_SubVS:
4854       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4855     case Op_SubVI:
4856       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4857     case Op_SubVL:
4858       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4859     case Op_SubVF:
4860       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4861     case Op_SubVD:
4862       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4863     case Op_MulVS:
4864       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4865     case Op_MulVI:
4866       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4867     case Op_MulVL:
4868       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4869     case Op_MulVF:
4870       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4871     case Op_MulVD:
4872       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4873     case Op_DivVF:
4874       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4875     case Op_DivVD:
4876       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4877     case Op_FmaVF:
4878       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4879     case Op_FmaVD:
4880       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4881     case Op_MaxV:
4882       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4883     case Op_MinV:
4884       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4885     case Op_UMaxV:
4886       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4887     case Op_UMinV:
4888       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4889     case Op_XorV:
4890       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4891     case Op_OrV:
4892       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4893     case Op_AndV:
4894       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4895     default:
4896       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4897       break;
4898   }
4899 }
4900 
4901 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4902                                   KRegister src1, KRegister src2) {
4903   BasicType etype = T_ILLEGAL;
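  // The mask length selects the k-register operation width: up to 8 lanes
  // use the byte-wide forms (e.g. kandbl), 16 lanes the word-wide forms, and
  // so on. etype merely encodes that width for the kand/kor/kxor calls below.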
4904   switch(mask_len) {
4905     case 2:
4906     case 4:
4907     case 8:  etype = T_BYTE; break;
4908     case 16: etype = T_SHORT; break;
4909     case 32: etype = T_INT; break;
4910     case 64: etype = T_LONG; break;
4911     default: fatal("Unsupported type"); break;
4912   }
4913   assert(etype != T_ILLEGAL, "");
4914   switch(ideal_opc) {
4915     case Op_AndVMask:
4916       kand(etype, dst, src1, src2); break;
4917     case Op_OrVMask:
4918       kor(etype, dst, src1, src2); break;
4919     case Op_XorVMask:
4920       kxor(etype, dst, src1, src2); break;
4921     default:
4922       fatal("Unsupported masked operation"); break;
4923   }
4924 }
4925 
4926 /*
4927  * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
4928  * If src is NaN, the result is 0.
4929  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4930  * the result is equal to the value of Integer.MIN_VALUE.
4931  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4932  * the result is equal to the value of Integer.MAX_VALUE.
4933  */
4934 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4935                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4936                                                                    Register rscratch, AddressLiteral float_sign_flip,
4937                                                                    int vec_enc) {
4938   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4939   Label done;
4940   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4941   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4942   vptest(xtmp2, xtmp2, vec_enc);
4943   jccb(Assembler::equal, done);
4944 
4945   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4946   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4947 
4948   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4949   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4950   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4951 
  // Recompute the mask for the remaining special values.
4953   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4954   // Extract SRC values corresponding to TRUE mask lanes.
4955   vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip mask bits so that the MSB of the MASK lanes corresponding to +ve
  // special values is set.
4958   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4959 
4960   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4961   bind(done);
4962 }
4963 
4964 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4965                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4966                                                                     Register rscratch, AddressLiteral float_sign_flip,
4967                                                                     int vec_enc) {
4968   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4969   Label done;
4970   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4971   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4972   kortestwl(ktmp1, ktmp1);
4973   jccb(Assembler::equal, done);
4974 
4975   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4976   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4977   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4978 
4979   kxorwl(ktmp1, ktmp1, ktmp2);
4980   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4981   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4982   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4983   bind(done);
4984 }
4985 
4986 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4987                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4988                                                                      Register rscratch, AddressLiteral double_sign_flip,
4989                                                                      int vec_enc) {
4990   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4991 
4992   Label done;
4993   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4994   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4995   kortestwl(ktmp1, ktmp1);
4996   jccb(Assembler::equal, done);
4997 
4998   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4999   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5000   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5001 
5002   kxorwl(ktmp1, ktmp1, ktmp2);
5003   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5004   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5005   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5006   bind(done);
5007 }
5008 
5009 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5010                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5011                                                                      Register rscratch, AddressLiteral float_sign_flip,
5012                                                                      int vec_enc) {
5013   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5014   Label done;
5015   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5016   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5017   kortestwl(ktmp1, ktmp1);
5018   jccb(Assembler::equal, done);
5019 
5020   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5021   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5022   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5023 
5024   kxorwl(ktmp1, ktmp1, ktmp2);
5025   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5026   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5027   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5028   bind(done);
5029 }
5030 
5031 /*
5032  * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
5033  * If src is NaN, the result is 0.
5034  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5035  * the result is equal to the value of Long.MIN_VALUE.
5036  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5037  * the result is equal to the value of Long.MAX_VALUE.
5038  */
5039 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5040                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5041                                                                       Register rscratch, AddressLiteral double_sign_flip,
5042                                                                       int vec_enc) {
5043   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5044 
5045   Label done;
5046   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5047   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
5048   kortestwl(ktmp1, ktmp1);
5049   jccb(Assembler::equal, done);
5050 
5051   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5052   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5053   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5054 
5055   kxorwl(ktmp1, ktmp1, ktmp2);
5056   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5057   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5058   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5059   bind(done);
5060 }
5061 
5062 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5063                                                              XMMRegister xtmp, int index, int vec_enc) {
5064    assert(vec_enc < Assembler::AVX_512bit, "");
5065    if (vec_enc == Assembler::AVX_256bit) {
5066      vextractf128_high(xtmp, src);
5067      vshufps(dst, src, xtmp, index, vec_enc);
5068    } else {
5069      vshufps(dst, src, zero, index, vec_enc);
5070    }
5071 }
5072 
5073 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5074                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5075                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
5076   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5077 
5078   Label done;
5079   // Compare the destination lanes with float_sign_flip
5080   // value to get mask for all special values.
5081   movdqu(xtmp1, float_sign_flip, rscratch);
5082   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5083   ptest(xtmp2, xtmp2);
5084   jccb(Assembler::equal, done);
5085 
5086   // Flip float_sign_flip to get max integer value.
5087   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5088   pxor(xtmp1, xtmp4);
5089 
  // Set destination lanes corresponding to unordered source lanes to zero.
5091   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5092   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5093 
  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5095   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5096   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5097 
  // Recompute the mask for the remaining special values.
5099   pxor(xtmp2, xtmp3);
5100   // Extract mask corresponding to non-negative source lanes.
5101   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5102 
  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5104   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5105   pand(xtmp3, xtmp2);
5106 
  // Replace destination lanes holding the special value (0x80000000) with max int
  // if the corresponding source lane holds a +ve value.
5109   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5110   bind(done);
5111 }
5112 
5113 
5114 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5115                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5116   switch(to_elem_bt) {
5117     case T_SHORT:
5118       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5119       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5120       vpackusdw(dst, dst, zero, vec_enc);
5121       if (vec_enc == Assembler::AVX_256bit) {
5122         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5123       }
5124       break;
5125     case  T_BYTE:
5126       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5127       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5128       vpackusdw(dst, dst, zero, vec_enc);
5129       if (vec_enc == Assembler::AVX_256bit) {
5130         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5131       }
5132       vpackuswb(dst, dst, zero, vec_enc);
5133       break;
5134     default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt));
5135   }
5136 }
5137 
5138 /*
5139  * Algorithm for vector D2L and F2I conversions (AVX 10.2 unsupported):-
5140  * a) Perform vector D2L/F2I cast.
5141  * b) Choose fast path if none of the result vector lane contains 0x80000000 value.
5142  *    It signifies that source value could be any of the special floating point
5143  *    values(NaN,-Inf,Inf,Max,-Min).
5144  * c) Set destination to zero if source is NaN value.
5145  * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5146  */
5147 
5148 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5149                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5150                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5151   int to_elem_sz = type2aelembytes(to_elem_bt);
5152   assert(to_elem_sz <= 4, "");
5153   vcvttps2dq(dst, src, vec_enc);
5154   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5155   if (to_elem_sz < 4) {
5156     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5157     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5158   }
5159 }
5160 
5161 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5162                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5163                                             Register rscratch, int vec_enc) {
5164   int to_elem_sz = type2aelembytes(to_elem_bt);
5165   assert(to_elem_sz <= 4, "");
5166   vcvttps2dq(dst, src, vec_enc);
5167   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5168   switch(to_elem_bt) {
5169     case T_INT:
5170       break;
5171     case T_SHORT:
5172       evpmovdw(dst, dst, vec_enc);
5173       break;
5174     case T_BYTE:
5175       evpmovdb(dst, dst, vec_enc);
5176       break;
5177     default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt));
5178   }
5179 }
5180 
5181 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5182                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5183                                             Register rscratch, int vec_enc) {
5184   evcvttps2qq(dst, src, vec_enc);
5185   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5186 }
5187 
5188 // Handling for downcasting from double to integer or sub-word types on AVX2.
5189 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5190                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5191                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5192   int to_elem_sz = type2aelembytes(to_elem_bt);
5193   assert(to_elem_sz < 8, "");
5194   vcvttpd2dq(dst, src, vec_enc);
5195   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5196                                               float_sign_flip, vec_enc);
5197   if (to_elem_sz < 4) {
5198     // xtmp4 holds all zero lanes.
5199     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5200   }
5201 }
5202 
5203 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5204                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5205                                             KRegister ktmp2, AddressLiteral sign_flip,
5206                                             Register rscratch, int vec_enc) {
5207   if (VM_Version::supports_avx512dq()) {
5208     evcvttpd2qq(dst, src, vec_enc);
5209     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5210     switch(to_elem_bt) {
5211       case T_LONG:
5212         break;
5213       case T_INT:
5214         evpmovsqd(dst, dst, vec_enc);
5215         break;
5216       case T_SHORT:
5217         evpmovsqd(dst, dst, vec_enc);
5218         evpmovdw(dst, dst, vec_enc);
5219         break;
5220       case T_BYTE:
5221         evpmovsqd(dst, dst, vec_enc);
5222         evpmovdb(dst, dst, vec_enc);
5223         break;
5224       default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt));
5225     }
5226   } else {
5227     assert(type2aelembytes(to_elem_bt) <= 4, "");
5228     vcvttpd2dq(dst, src, vec_enc);
5229     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5230     switch(to_elem_bt) {
5231       case T_INT:
5232         break;
5233       case T_SHORT:
5234         evpmovdw(dst, dst, vec_enc);
5235         break;
5236       case T_BYTE:
5237         evpmovdb(dst, dst, vec_enc);
5238         break;
5239       default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt));
5240     }
5241   }
5242 }
5243 
5244 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5245   switch(to_elem_bt) {
5246     case T_LONG:
5247       evcvttps2qqs(dst, src, vec_enc);
5248       break;
5249     case T_INT:
5250       evcvttps2dqs(dst, src, vec_enc);
5251       break;
5252     case T_SHORT:
5253       evcvttps2dqs(dst, src, vec_enc);
5254       evpmovdw(dst, dst, vec_enc);
5255       break;
5256     case T_BYTE:
5257       evcvttps2dqs(dst, src, vec_enc);
5258       evpmovdb(dst, dst, vec_enc);
5259       break;
5260     default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt));
5261   }
5262 }
5263 
5264 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5265   switch(to_elem_bt) {
5266     case T_LONG:
5267       evcvttps2qqs(dst, src, vec_enc);
5268       break;
5269     case T_INT:
5270       evcvttps2dqs(dst, src, vec_enc);
5271       break;
5272     case T_SHORT:
5273       evcvttps2dqs(dst, src, vec_enc);
5274       evpmovdw(dst, dst, vec_enc);
5275       break;
5276     case T_BYTE:
5277       evcvttps2dqs(dst, src, vec_enc);
5278       evpmovdb(dst, dst, vec_enc);
5279       break;
5280     default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt));
5281   }
5282 }
5283 
5284 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5285   switch(to_elem_bt) {
5286     case T_LONG:
5287       evcvttpd2qqs(dst, src, vec_enc);
5288       break;
5289     case T_INT:
5290       evcvttpd2dqs(dst, src, vec_enc);
5291       break;
5292     case T_SHORT:
5293       evcvttpd2dqs(dst, src, vec_enc);
5294       evpmovdw(dst, dst, vec_enc);
5295       break;
5296     case T_BYTE:
5297       evcvttpd2dqs(dst, src, vec_enc);
5298       evpmovdb(dst, dst, vec_enc);
5299       break;
5300     default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (reg src): %s", type2name(to_elem_bt));
5301   }
5302 }
5303 
5304 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5305   switch(to_elem_bt) {
5306     case T_LONG:
5307       evcvttpd2qqs(dst, src, vec_enc);
5308       break;
5309     case T_INT:
5310       evcvttpd2dqs(dst, src, vec_enc);
5311       break;
5312     case T_SHORT:
5313       evcvttpd2dqs(dst, src, vec_enc);
5314       evpmovdw(dst, dst, vec_enc);
5315       break;
5316     case T_BYTE:
5317       evcvttpd2dqs(dst, src, vec_enc);
5318       evpmovdb(dst, dst, vec_enc);
5319       break;
5320     default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt));
5321   }
5322 }
5323 
5324 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5325                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5326                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation with MXCSR.RC mode set to round towards -inf,
  // and restore the original MXCSR.RC mode after that.
5329   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
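  // With RC = round-towards -inf, the add/convert pair below computes
  // floor(val + 0.5) in a single conversion step, i.e. the classical
  // Math.round formulation referenced above.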
5330 
5331   mov64(tmp, julong_cast(0.5L));
5332   evpbroadcastq(xtmp1, tmp, vec_enc);
5333   vaddpd(xtmp1, src , xtmp1, vec_enc);
5334   evcvtpd2qq(dst, xtmp1, vec_enc);
5335   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                                double_sign_flip, vec_enc);
5337 
5338   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5339 }
5340 
5341 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5342                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5343                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation with MXCSR.RC mode set to round towards -inf,
  // and restore the original MXCSR.RC mode after that.
5346   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5347 
5348   movl(tmp, jint_cast(0.5));
5349   movq(xtmp1, tmp);
5350   vbroadcastss(xtmp1, xtmp1, vec_enc);
5351   vaddps(xtmp1, src , xtmp1, vec_enc);
5352   vcvtps2dq(dst, xtmp1, vec_enc);
5353   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5354                                               float_sign_flip, vec_enc);
5355 
5356   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5357 }
5358 
5359 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5360                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5361                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform the floor(val+0.5) operation with MXCSR.RC mode set to round towards -inf,
  // and restore the original MXCSR.RC mode after that.
5364   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5365 
5366   movl(tmp, jint_cast(0.5));
5367   movq(xtmp1, tmp);
5368   vbroadcastss(xtmp1, xtmp1, vec_enc);
5369   vaddps(xtmp1, src , xtmp1, vec_enc);
5370   vcvtps2dq(dst, xtmp1, vec_enc);
5371   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5372 
5373   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5374 }
5375 
5376 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5377                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5378   switch (from_elem_bt) {
5379     case T_BYTE:
5380       switch (to_elem_bt) {
5381         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5382         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5383         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5384         default: ShouldNotReachHere();
5385       }
5386       break;
5387     case T_SHORT:
5388       switch (to_elem_bt) {
5389         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5390         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5391         default: ShouldNotReachHere();
5392       }
5393       break;
5394     case T_INT:
5395       assert(to_elem_bt == T_LONG, "");
5396       vpmovzxdq(dst, src, vlen_enc);
5397       break;
5398     default:
5399       ShouldNotReachHere();
5400   }
5401 }
5402 
5403 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5404                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5405   switch (from_elem_bt) {
5406     case T_BYTE:
5407       switch (to_elem_bt) {
5408         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5409         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5410         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5411         default: ShouldNotReachHere();
5412       }
5413       break;
5414     case T_SHORT:
5415       switch (to_elem_bt) {
5416         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5417         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5418         default: ShouldNotReachHere();
5419       }
5420       break;
5421     case T_INT:
5422       assert(to_elem_bt == T_LONG, "");
5423       vpmovsxdq(dst, src, vlen_enc);
5424       break;
5425     default:
5426       ShouldNotReachHere();
5427   }
5428 }
5429 
5430 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5431                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5432   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5433   assert(vlen_enc != AVX_512bit, "");
5434 
5435   int dst_bt_size = type2aelembytes(dst_bt);
5436   int src_bt_size = type2aelembytes(src_bt);
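  // Mask lanes are all-zeros or all-ones, so widening can use sign extension
  // and narrowing can use signed saturating packs: both map 0 -> 0 and
  // -1 -> -1 exactly. The vpermq shuffles below repair the cross-lane
  // ordering that the in-lane 256-bit packs leave behind.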
5437   if (dst_bt_size > src_bt_size) {
5438     switch (dst_bt_size / src_bt_size) {
5439       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5440       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5441       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5442       default: ShouldNotReachHere();
5443     }
5444   } else {
5445     assert(dst_bt_size < src_bt_size, "");
5446     switch (src_bt_size / dst_bt_size) {
5447       case 2: {
5448         if (vlen_enc == AVX_128bit) {
5449           vpacksswb(dst, src, src, vlen_enc);
5450         } else {
5451           vpacksswb(dst, src, src, vlen_enc);
5452           vpermq(dst, dst, 0x08, vlen_enc);
5453         }
5454         break;
5455       }
5456       case 4: {
5457         if (vlen_enc == AVX_128bit) {
5458           vpackssdw(dst, src, src, vlen_enc);
5459           vpacksswb(dst, dst, dst, vlen_enc);
5460         } else {
5461           vpackssdw(dst, src, src, vlen_enc);
5462           vpermq(dst, dst, 0x08, vlen_enc);
5463           vpacksswb(dst, dst, dst, AVX_128bit);
5464         }
5465         break;
5466       }
5467       case 8: {
5468         if (vlen_enc == AVX_128bit) {
5469           vpshufd(dst, src, 0x08, vlen_enc);
5470           vpackssdw(dst, dst, dst, vlen_enc);
5471           vpacksswb(dst, dst, dst, vlen_enc);
5472         } else {
5473           vpshufd(dst, src, 0x08, vlen_enc);
5474           vpermq(dst, dst, 0x08, vlen_enc);
5475           vpackssdw(dst, dst, dst, AVX_128bit);
5476           vpacksswb(dst, dst, dst, AVX_128bit);
5477         }
5478         break;
5479       }
5480       default: ShouldNotReachHere();
5481     }
5482   }
5483 }
5484 
5485 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5486                                    bool merge, BasicType bt, int vlen_enc) {
5487   if (bt == T_INT) {
5488     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5489   } else {
5490     assert(bt == T_LONG, "");
5491     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5492   }
5493 }
5494 
5495 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5496                                    bool merge, BasicType bt, int vlen_enc) {
5497   if (bt == T_INT) {
5498     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5499   } else {
5500     assert(bt == T_LONG, "");
5501     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5502   }
5503 }
5504 
5505 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5506                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5507                                                int vec_enc) {
5508   int index = 0;
5509   int vindex = 0;
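  // PDEP with the mask 0x0101010101010101 deposits the low 8 bits of the
  // scalar mask into the least significant bit of each of the 8 byte lanes,
  // i.e. it expands one mask bit per byte.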
5510   mov64(rtmp1, 0x0101010101010101L);
5511   pdepq(rtmp1, src, rtmp1);
5512   if (mask_len > 8) {
5513     movq(rtmp2, src);
5514     vpxor(xtmp, xtmp, xtmp, vec_enc);
5515     movq(xtmp, rtmp1);
5516   }
5517   movq(dst, rtmp1);
5518 
5519   mask_len -= 8;
5520   while (mask_len > 0) {
    assert((mask_len & 0x7) == 0, "mask_len must be a multiple of 8");
5522     index++;
5523     if ((index % 2) == 0) {
5524       pxor(xtmp, xtmp);
5525     }
5526     mov64(rtmp1, 0x0101010101010101L);
5527     shrq(rtmp2, 8);
5528     pdepq(rtmp1, rtmp2, rtmp1);
5529     pinsrq(xtmp, rtmp1, index % 2);
5530     vindex = index / 2;
5531     if (vindex) {
      // Write the entire 16 byte vector once both 64 bit
      // lanes have been updated, to save redundant instructions.
5534       if (index % 2) {
5535         vinsertf128(dst, dst, xtmp, vindex);
5536       }
5537     } else {
5538       vmovdqu(dst, xtmp);
5539     }
5540     mask_len -= 8;
5541   }
5542 }
5543 
5544 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5545   switch(opc) {
5546     case Op_VectorMaskTrueCount:
5547       popcntq(dst, tmp);
5548       break;
5549     case Op_VectorMaskLastTrue:
5550       if (VM_Version::supports_lzcnt()) {
5551         lzcntq(tmp, tmp);
5552         movl(dst, 63);
5553         subl(dst, tmp);
5554       } else {
5555         movl(dst, -1);
5556         bsrq(tmp, tmp);
5557         cmov32(Assembler::notZero, dst, tmp);
5558       }
5559       break;
5560     case Op_VectorMaskFirstTrue:
5561       if (VM_Version::supports_bmi1()) {
5562         if (masklen < 32) {
5563           orl(tmp, 1 << masklen);
5564           tzcntl(dst, tmp);
5565         } else if (masklen == 32) {
5566           tzcntl(dst, tmp);
5567         } else {
5568           assert(masklen == 64, "");
5569           tzcntq(dst, tmp);
5570         }
5571       } else {
5572         if (masklen < 32) {
5573           orl(tmp, 1 << masklen);
5574           bsfl(dst, tmp);
5575         } else {
5576           assert(masklen == 32 || masklen == 64, "");
5577           movl(dst, masklen);
5578           if (masklen == 32)  {
5579             bsfl(tmp, tmp);
5580           } else {
5581             bsfq(tmp, tmp);
5582           }
5583           cmov32(Assembler::notZero, dst, tmp);
5584         }
5585       }
5586       break;
5587     case Op_VectorMaskToLong:
5588       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5589       break;
5590     default: assert(false, "Unhandled mask operation");
5591   }
5592 }
5593 
5594 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5595                                               int masklen, int masksize, int vec_enc) {
5596   assert(VM_Version::supports_popcnt(), "");
5597 
  if (VM_Version::supports_avx512bw()) {
5599     kmovql(tmp, mask);
5600   } else {
5601     assert(masklen <= 16, "");
5602     kmovwl(tmp, mask);
5603   }
5604 
  // Masks generated by partial vector comparison/replicate/mask manipulation
  // operations need to be clipped.
5607   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5608     andq(tmp, (1 << masklen) - 1);
5609   }
5610 
5611   vector_mask_operation_helper(opc, dst, tmp, masklen);
5612 }
5613 
5614 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5615                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5616   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5617          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5618   assert(VM_Version::supports_popcnt(), "");
5619 
5620   bool need_clip = false;
5621   switch(bt) {
5622     case T_BOOLEAN:
      // While masks of other types contain 0 or -1, boolean masks contain lane values of 0 or 1
5624       vpxor(xtmp, xtmp, xtmp, vec_enc);
5625       vpsubb(xtmp, xtmp, mask, vec_enc);
5626       vpmovmskb(tmp, xtmp, vec_enc);
5627       need_clip = masklen < 16;
5628       break;
5629     case T_BYTE:
5630       vpmovmskb(tmp, mask, vec_enc);
5631       need_clip = masklen < 16;
5632       break;
5633     case T_SHORT:
5634       vpacksswb(xtmp, mask, mask, vec_enc);
5635       if (masklen >= 16) {
5636         vpermpd(xtmp, xtmp, 8, vec_enc);
5637       }
5638       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5639       need_clip = masklen < 16;
5640       break;
5641     case T_INT:
5642     case T_FLOAT:
5643       vmovmskps(tmp, mask, vec_enc);
5644       need_clip = masklen < 4;
5645       break;
5646     case T_LONG:
5647     case T_DOUBLE:
5648       vmovmskpd(tmp, mask, vec_enc);
5649       need_clip = masklen < 2;
5650       break;
5651     default: assert(false, "Unhandled type, %s", type2name(bt));
5652   }
5653 
  // Masks generated by partial vector comparison/replicate/mask manipulation
  // operations need to be clipped.
5656   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5657     // need_clip implies masklen < 32
5658     andq(tmp, (1 << masklen) - 1);
5659   }
5660 
5661   vector_mask_operation_helper(opc, dst, tmp, masklen);
5662 }
5663 
5664 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5665                                              Register rtmp2, int mask_len) {
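  // Clip the mask to mask_len bits, then use PEXT to gather that many bits
  // of an all ones source into the low order positions: the result is a mask
  // with popcount(src) contiguous set bits starting at bit 0, e.g. 0b1010
  // compresses to 0b0011.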
5666   kmov(rtmp1, src);
5667   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5668   mov64(rtmp2, -1L);
5669   pextq(rtmp2, rtmp2, rtmp1);
5670   kmov(dst, rtmp2);
5671 }
5672 
5673 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5674                                                     XMMRegister mask, Register rtmp, Register rscratch,
5675                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5676                                                     int vec_enc) {
5677   assert(type2aelembytes(bt) >= 4, "");
5678   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5679   address compress_perm_table = nullptr;
5680   address expand_perm_table = nullptr;
5681   if (type2aelembytes(bt) == 8) {
5682     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5683     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5684     vmovmskpd(rtmp, mask, vec_enc);
5685   } else {
5686     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5687     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5688     vmovmskps(rtmp, mask, vec_enc);
5689   }
5690   shlq(rtmp, 5); // for 32 byte permute row.
5691   if (opcode == Op_CompressV) {
5692     lea(rscratch, ExternalAddress(compress_perm_table));
5693   } else {
5694     lea(rscratch, ExternalAddress(expand_perm_table));
5695   }
5696   addptr(rtmp, rscratch);
5697   vmovdqu(permv, Address(rtmp));
5698   vpermps(dst, permv, src, Assembler::AVX_256bit);
5699   vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with a zero vector using the permute mask. Each column
  // entry in a permute table row contains either a valid permute index or a
  // -1 (default) value, so the row can also serve as a blending mask after
  // compressing/expanding the source vector lanes.
5704   vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
5705 }
5706 
5707 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5708                                                bool merge, BasicType bt, int vec_enc) {
5709   if (opcode == Op_CompressV) {
5710     switch(bt) {
5711     case T_BYTE:
5712       evpcompressb(dst, mask, src, merge, vec_enc);
5713       break;
5714     case T_CHAR:
5715     case T_SHORT:
5716       evpcompressw(dst, mask, src, merge, vec_enc);
5717       break;
5718     case T_INT:
5719       evpcompressd(dst, mask, src, merge, vec_enc);
5720       break;
5721     case T_FLOAT:
5722       evcompressps(dst, mask, src, merge, vec_enc);
5723       break;
5724     case T_LONG:
5725       evpcompressq(dst, mask, src, merge, vec_enc);
5726       break;
5727     case T_DOUBLE:
5728       evcompresspd(dst, mask, src, merge, vec_enc);
5729       break;
5730     default:
5731       fatal("Unsupported type %s", type2name(bt));
5732       break;
5733     }
5734   } else {
5735     assert(opcode == Op_ExpandV, "");
5736     switch(bt) {
5737     case T_BYTE:
5738       evpexpandb(dst, mask, src, merge, vec_enc);
5739       break;
5740     case T_CHAR:
5741     case T_SHORT:
5742       evpexpandw(dst, mask, src, merge, vec_enc);
5743       break;
5744     case T_INT:
5745       evpexpandd(dst, mask, src, merge, vec_enc);
5746       break;
5747     case T_FLOAT:
5748       evexpandps(dst, mask, src, merge, vec_enc);
5749       break;
5750     case T_LONG:
5751       evpexpandq(dst, mask, src, merge, vec_enc);
5752       break;
5753     case T_DOUBLE:
5754       evexpandpd(dst, mask, src, merge, vec_enc);
5755       break;
5756     default:
5757       fatal("Unsupported type %s", type2name(bt));
5758       break;
5759     }
5760   }
5761 }
5762 
5763 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5764                                            KRegister ktmp1, int vec_enc) {
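  // Signum semantics: dst = 1.0 if src > 0, -1.0 if src < 0, and src itself
  // when src is 0.0, -0.0 or NaN. dst is seeded with 0 - 1 = -1 and then
  // corrected lane-wise by the two masked blends below.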
5765   if (opcode == Op_SignumVD) {
5766     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5768     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5769     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src.
5771     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5772     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5773   } else {
5774     assert(opcode == Op_SignumVF, "");
5775     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5777     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5778     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src.
5780     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5781     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5782   }
5783 }
5784 
5785 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5786                                           XMMRegister xtmp1, int vec_enc) {
5787   if (opcode == Op_SignumVD) {
5788     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5790     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src is NaN, -0.0 or 0.0, return src.
5792     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5793     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5794   } else {
5795     assert(opcode == Op_SignumVF, "");
5796     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5798     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src is NaN, -0.0 or 0.0, return src.
5800     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5801     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5802   }
5803 }
5804 
5805 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5806   if (VM_Version::supports_avx512bw()) {
5807     if (mask_len > 32) {
5808       kmovql(dst, src);
5809     } else {
5810       kmovdl(dst, src);
5811       if (mask_len != 32) {
5812         kshiftrdl(dst, dst, 32 - mask_len);
5813       }
5814     }
5815   } else {
5816     assert(mask_len <= 16, "");
5817     kmovwl(dst, src);
5818     if (mask_len != 16) {
5819       kshiftrwl(dst, dst, 16 - mask_len);
5820     }
5821   }
5822 }
5823 
5824 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5825   int lane_size = type2aelembytes(bt);
5826   if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5827       (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5828     movptr(rtmp, imm32);
5829     switch(lane_size) {
5830       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5831       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5832       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5833       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5834       fatal("Unsupported lane size %d", lane_size);
5835       break;
5836     }
5837   } else {
5838     movptr(rtmp, imm32);
5839     movq(dst, rtmp);
5840     switch(lane_size) {
5841       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5842       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5843       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5844       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5845       fatal("Unsupported lane size %d", lane_size);
5846       break;
5847     }
5848   }
5849 }
5850 
5851 //
// The following is a lookup table based popcount computation algorithm:
5853 //       Index   Bit set count
5854 //     [ 0000 ->   0,
5855 //       0001 ->   1,
5856 //       0010 ->   1,
5857 //       0011 ->   2,
5858 //       0100 ->   1,
5859 //       0101 ->   2,
5860 //       0110 ->   2,
5861 //       0111 ->   3,
5862 //       1000 ->   1,
5863 //       1001 ->   2,
5864 //       1010 ->   3,
5865 //       1011 ->   3,
5866 //       1100 ->   2,
5867 //       1101 ->   3,
5868 //       1111 ->   4 ]
5869 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5870 //     shuffle indices for lookup table access.
5871 //  b. Right shift each byte of vector lane by 4 positions.
//  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
5873 //     shuffle indices for lookup table access.
5874 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5875 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5876 //     count of all the bytes of a quadword.
5877 //  f. Perform step e. for upper 128bit vector lane.
5878 //  g. Pack the bitset count of quadwords back to double word.
5879 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
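//
// Worked example for a single byte: src = 0xA7 (0b10100111).
//   Lower nibble 0x7 -> LUT[7]  = 3 set bits.
//   Upper nibble 0xA -> LUT[10] = 2 set bits.
//   popcount(0xA7)   = 3 + 2 = 5.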
5880 
5881 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5882                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5883   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5884   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5885   vpsrlw(dst, src, 4, vec_enc);
5886   vpand(dst, dst, xtmp1, vec_enc);
5887   vpand(xtmp1, src, xtmp1, vec_enc);
5888   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5889   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5890   vpshufb(dst, xtmp2, dst, vec_enc);
5891   vpaddb(dst, dst, xtmp1, vec_enc);
5892 }
5893 
5894 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5895                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5896   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5897   // Following code is as per steps e,f,g and h of above algorithm.
5898   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5899   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5900   vpsadbw(dst, dst, xtmp2, vec_enc);
5901   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5902   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5903   vpackuswb(dst, xtmp1, dst, vec_enc);
5904 }
5905 
5906 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5907                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5908   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5909   // Add the popcount of upper and lower bytes of word.
5910   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5911   vpsrlw(dst, xtmp1, 8, vec_enc);
5912   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5913   vpaddw(dst, dst, xtmp1, vec_enc);
5914 }
5915 
5916 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5917                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5918   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5919   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5920   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5921 }
5922 
5923 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5924                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5925   switch(bt) {
5926     case T_LONG:
5927       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5928       break;
5929     case T_INT:
5930       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5931       break;
5932     case T_CHAR:
5933     case T_SHORT:
5934       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5935       break;
5936     case T_BYTE:
5937     case T_BOOLEAN:
5938       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5939       break;
5940     default:
5941       fatal("Unsupported type %s", type2name(bt));
5942       break;
5943   }
5944 }
5945 
5946 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5947                                                       KRegister mask, bool merge, int vec_enc) {
5948   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5949   switch(bt) {
5950     case T_LONG:
5951       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5952       evpopcntq(dst, mask, src, merge, vec_enc);
5953       break;
5954     case T_INT:
5955       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5956       evpopcntd(dst, mask, src, merge, vec_enc);
5957       break;
5958     case T_CHAR:
5959     case T_SHORT:
5960       assert(VM_Version::supports_avx512_bitalg(), "");
5961       evpopcntw(dst, mask, src, merge, vec_enc);
5962       break;
5963     case T_BYTE:
5964     case T_BOOLEAN:
5965       assert(VM_Version::supports_avx512_bitalg(), "");
5966       evpopcntb(dst, mask, src, merge, vec_enc);
5967       break;
5968     default:
5969       fatal("Unsupported type %s", type2name(bt));
5970       break;
5971   }
5972 }
5973 
// The bit reversal algorithm first reverses the bits of each byte, followed
// by a byte level reversal for multi-byte primitive types (short/int/long).
// The algorithm performs a lookup table access to get the reverse bit
// sequence corresponding to a 4 bit value, so the reverse bit sequence of a
// byte is obtained by swapping the reverse bit sequences of its upper and
// lower nibbles.
5980 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5981                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5982   if (VM_Version::supports_avx512vlbw()) {
5983 
5984     // Get the reverse bit sequence of lower nibble of each byte.
5985     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5986     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5987     evpandq(dst, xtmp2, src, vec_enc);
5988     vpshufb(dst, xtmp1, dst, vec_enc);
5989     vpsllq(dst, dst, 4, vec_enc);
5990 
5991     // Get the reverse bit sequence of upper nibble of each byte.
5992     vpandn(xtmp2, xtmp2, src, vec_enc);
5993     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5994     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5995 
5996     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5997     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5998     evporq(xtmp2, dst, xtmp2, vec_enc);
5999     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6000 
  } else if (vec_enc == Assembler::AVX_512bit) {
6002     // Shift based bit reversal.
6003     assert(bt == T_LONG || bt == T_INT, "");
6004 
6005     // Swap lower and upper nibble of each byte.
6006     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
6007 
6008     // Swap two least and most significant bits of each nibble.
6009     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
6010 
6011     // Swap adjacent pair of bits.
6012     evmovdqul(xtmp1, k0, dst, true, vec_enc);
6013     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
6014 
6015     evmovdqul(xtmp1, k0, dst, true, vec_enc);
6016     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
6017   } else {
6018     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
6019     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6020 
6021     // Get the reverse bit sequence of lower nibble of each byte.
6022     vpand(dst, xtmp2, src, vec_enc);
6023     vpshufb(dst, xtmp1, dst, vec_enc);
6024     vpsllq(dst, dst, 4, vec_enc);
6025 
6026     // Get the reverse bit sequence of upper nibble of each byte.
6027     vpandn(xtmp2, xtmp2, src, vec_enc);
6028     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6029     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6030 
6031     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
6032     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
6033     vpor(xtmp2, dst, xtmp2, vec_enc);
6034     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6035   }
6036 }
6037 
6038 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
6039                                                 XMMRegister xtmp, Register rscratch) {
6040   assert(VM_Version::supports_gfni(), "");
6041   assert(rscratch != noreg || always_reachable(mask), "missing");
6042 
  // Galois field instruction based bit reversal, based on the following algorithm:
6044   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6045   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
6046   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
6047   vector_reverse_byte(bt, dst, xtmp, vec_enc);
6048 }
6049 
6050 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
6051                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
6052   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
6053   evpandq(dst, xtmp1, src, vec_enc);
6054   vpsllq(dst, dst, nbits, vec_enc);
6055   vpandn(xtmp1, xtmp1, src, vec_enc);
6056   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
6057   evporq(dst, dst, xtmp1, vec_enc);
6058 }
6059 
6060 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6061                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
6062   // Shift based bit reversal.
6063   assert(VM_Version::supports_evex(), "");
6064   switch(bt) {
6065     case T_LONG:
6066       // Swap upper and lower double word of each quad word.
6067       evprorq(xtmp1, k0, src, 32, true, vec_enc);
6068       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
6069       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6070       break;
6071     case T_INT:
6072       // Swap upper and lower word of each double word.
6073       evprord(xtmp1, k0, src, 16, true, vec_enc);
6074       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6075       break;
6076     case T_CHAR:
6077     case T_SHORT:
6078       // Swap upper and lower byte of each word.
6079       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
6080       break;
6081     case T_BYTE:
6082       evmovdquq(dst, k0, src, true, vec_enc);
6083       break;
6084     default:
6085       fatal("Unsupported type %s", type2name(bt));
6086       break;
6087   }
6088 }
6089 
6090 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
6091   if (bt == T_BYTE) {
6092     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
6093       evmovdquq(dst, k0, src, true, vec_enc);
6094     } else {
6095       vmovdqu(dst, src);
6096     }
6097     return;
6098   }
6099   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
6100   // pre-computed shuffle indices.
6101   switch(bt) {
6102     case T_LONG:
6103       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
6104       break;
6105     case T_INT:
6106       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
6107       break;
6108     case T_CHAR:
6109     case T_SHORT:
6110       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6111       break;
6112     default:
6113       fatal("Unsupported type %s", type2name(bt));
6114       break;
6115   }
6116   vpshufb(dst, src, dst, vec_enc);
6117 }
6118 
6119 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6120                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6121                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6122   assert(is_integral_type(bt), "");
6123   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6124   assert(VM_Version::supports_avx512cd(), "");
6125   switch(bt) {
6126     case T_LONG:
6127       evplzcntq(dst, ktmp, src, merge, vec_enc);
6128       break;
6129     case T_INT:
6130       evplzcntd(dst, ktmp, src, merge, vec_enc);
6131       break;
6132     case T_SHORT:
6133       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6134       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6135       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6136       vpunpckhwd(dst, xtmp1, src, vec_enc);
6137       evplzcntd(dst, ktmp, dst, merge, vec_enc);
6138       vpackusdw(dst, xtmp2, dst, vec_enc);
6139       break;
6140     case T_BYTE:
6141       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6142       // accessing the lookup table.
6143       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6144       // accessing the lookup table.
6145       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6146       assert(VM_Version::supports_avx512bw(), "");
6147       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6148       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6149       vpand(xtmp2, dst, src, vec_enc);
6150       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6151       vpsrlw(xtmp3, src, 4, vec_enc);
6152       vpand(xtmp3, dst, xtmp3, vec_enc);
6153       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6154       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6155       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6156       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6157       break;
6158     default:
6159       fatal("Unsupported type %s", type2name(bt));
6160       break;
6161   }
6162 }
6163 
6164 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6165                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6166   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6167   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6168   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6169   // accessing the lookup table.
6170   vpand(dst, xtmp2, src, vec_enc);
6171   vpshufb(dst, xtmp1, dst, vec_enc);
6172   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6173   // accessing the lookup table.
6174   vpsrlw(xtmp3, src, 4, vec_enc);
6175   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6176   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6177   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6178   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6179   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6180   vpaddb(dst, dst, xtmp2, vec_enc);
6181   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6182 }
6183 
6184 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6185                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6186   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6187   // Add zero counts of lower byte and upper byte of a word if
6188   // upper byte holds a zero value.
6189   vpsrlw(xtmp3, src, 8, vec_enc);
6190   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6191   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6192   vpsllw(xtmp2, dst, 8, vec_enc);
6193   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6194   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6195   vpsrlw(dst, dst, 8, vec_enc);
6196 }
6197 
6198 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6199                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
6200   // By converting the integer to a float, we can obtain the number of leading zeros based on the exponent of the float.
6201   // As the float exponent contains a bias of 127 for nonzero values, the bias must be removed before interpreting the
6202   // exponent as the leading zero count.
6203 
6204   // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
6205   // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
6206   // contributes to the leading number of zeros.
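  // Worked example: src = 1 is already a power of two, converts to 1.0f with
  // a biased exponent of 127, and 127 - 127 = 0, giving 31 - 0 = 31 = clz(1).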
6207   vpsrld(dst, src, 1, vec_enc);
6208   vpandn(dst, dst, src, vec_enc);
6209 
6210   vcvtdq2ps(dst, dst, vec_enc);
6211 
6212   // By comparing the register to itself, all the bits in the destination are set.
6213   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6214 
6215   // Move the biased exponent to the low end of the lane and mask with 0xFF to discard the sign bit.
6216   vpsrld(xtmp2, xtmp1, 24, vec_enc);
6217   vpsrld(dst, dst, 23, vec_enc);
6218   vpand(dst, xtmp2, dst, vec_enc);
6219 
6220   // Subtract 127 from the exponent, which removes the bias from the exponent.
6221   vpsrld(xtmp2, xtmp1, 25, vec_enc);
6222   vpsubd(dst, dst, xtmp2, vec_enc);
6223 
6224   vpsrld(xtmp2, xtmp1, 27, vec_enc);
6225 
6226   // If the original value is 0 the exponent would not have bias, so the subtraction creates a negative number. If this
6227   // is found in any of the lanes, replace the lane with -1 from xtmp1.
6228   vblendvps(dst, dst, xtmp1, dst, vec_enc, true, xtmp3);
6229 
6230   // If the original value is negative, replace the lane with 31.
6231   vblendvps(dst, dst, xtmp2, src, vec_enc, true, xtmp3);
6232 
6233   // Subtract the exponent from 31, giving the final result. For 0, the result is 32 as the exponent was replaced with -1,
6234   // and for negative numbers the result is 0 as the exponent was replaced with 31.
6235   vpsubd(dst, xtmp2, dst, vec_enc);
6236 }
6237 
6238 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6239                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6240   // Find the leading zeros of the top and bottom halves of the long individually.
6241   vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6242 
6243   // Move the top half result to the bottom half of xtmp1, setting the top half to 0.
6244   vpsrlq(xtmp1, dst, 32, vec_enc);
6245   // By moving the top half result to the right by 6 bits, if the top half was empty (i.e. 32 is returned) the result bit will
6246   // be in the most significant position of the bottom half.
6247   vpsrlq(xtmp2, dst, 6, vec_enc);
6248 
6249   // In the bottom half, add the top half and bottom half results.
6250   vpaddq(dst, xtmp1, dst, vec_enc);
6251 
6252   // For the bottom half, choose between the values using the most significant bit of xtmp2.
  // If the MSB is set, then bottom+top in dst is the resulting value. If the top half is less
  // than 32, xtmp1 is chosen, which contains only the top half result.
6255   // In the top half the MSB is always zero, so the value in xtmp1 is always chosen. This value is always 0, which clears
6256   // the lane as required.
6257   vblendvps(dst, xtmp1, dst, xtmp2, vec_enc, true, xtmp3);
6258 }
6259 
6260 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6261                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6262                                                        Register rtmp, int vec_enc) {
6263   assert(is_integral_type(bt), "unexpected type");
6264   assert(vec_enc < Assembler::AVX_512bit, "");
6265   switch(bt) {
6266     case T_LONG:
6267       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6268       break;
6269     case T_INT:
6270       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6271       break;
6272     case T_SHORT:
6273       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6274       break;
6275     case T_BYTE:
6276       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6277       break;
6278     default:
6279       fatal("Unsupported type %s", type2name(bt));
6280       break;
6281   }
6282 }
6283 
6284 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6285   switch(bt) {
6286     case T_BYTE:
6287       vpsubb(dst, src1, src2, vec_enc);
6288       break;
6289     case T_SHORT:
6290       vpsubw(dst, src1, src2, vec_enc);
6291       break;
6292     case T_INT:
6293       vpsubd(dst, src1, src2, vec_enc);
6294       break;
6295     case T_LONG:
6296       vpsubq(dst, src1, src2, vec_enc);
6297       break;
6298     default:
6299       fatal("Unsupported type %s", type2name(bt));
6300       break;
6301   }
6302 }
6303 
// Trailing zero count computation is based on the leading zero count operation, as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
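//
// Worked example (int): x = 8 (0b1000): (x - 1) & ~x = 7 & ~8 = 0b0111,
// CLZ(0b0111) = 29, so CTZ = 32 - 29 = 3, the trailing zero count of 8.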
6308 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6309                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6310                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6311   assert(is_integral_type(bt), "");
  // xtmp4 = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp4 = xtmp4 + src = src - 1
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp4 = xtmp4 & ~src = (src - 1) & ~src
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6318   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6319   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6320   vpsub(bt, dst, xtmp4, dst, vec_enc);
6321 }
6322 
// Trailing zero count computation for AVX2 targets is based on the popcount operation, as per the following equation:
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
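//
// Worked example (int): x = 8: x | -x = 0x00000008 | 0xFFFFFFF8 = 0xFFFFFFF8,
// POPC(0xFFFFFFF8) = 29, so CTZ = 32 - 29 = 3.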
6325 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6326                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6327   assert(is_integral_type(bt), "");
6328   // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6330   // xtmp = 0 - src
6331   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6332   // xtmp = xtmp | src
6333   vpor(xtmp3, xtmp3, src, vec_enc);
6334   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6335   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6336   vpsub(bt, dst, xtmp1, dst, vec_enc);
6337 }
6338 
6339 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6340   Label done;
6341   Label neg_divisor_fastpath;
6342   cmpl(divisor, 0);
6343   jccb(Assembler::less, neg_divisor_fastpath);
6344   xorl(rdx, rdx);
6345   divl(divisor);
6346   jmpb(done);
6347   bind(neg_divisor_fastpath);
6348   // Fastpath for divisor < 0:
6349   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6350   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
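  // When the divisor is negative as a signed int, it is >= 2^31 as unsigned,
  // so the unsigned quotient can only be 0 or 1; the expression above yields
  // 1 exactly when dividend >=u divisor, e.g. 0xF0000000 /u 0x80000000 = 1.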
6351   movl(rdx, rax);
6352   subl(rdx, divisor);
6353   if (VM_Version::supports_bmi1()) {
6354     andnl(rax, rdx, rax);
6355   } else {
6356     notl(rdx);
6357     andl(rax, rdx);
6358   }
6359   shrl(rax, 31);
6360   bind(done);
6361 }
6362 
6363 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6364   Label done;
6365   Label neg_divisor_fastpath;
6366   cmpl(divisor, 0);
6367   jccb(Assembler::less, neg_divisor_fastpath);
6368   xorl(rdx, rdx);
6369   divl(divisor);
6370   jmpb(done);
6371   bind(neg_divisor_fastpath);
6372   // Fastpath when divisor < 0:
6373   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6374   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6375   movl(rdx, rax);
6376   subl(rax, divisor);
6377   if (VM_Version::supports_bmi1()) {
6378     andnl(rax, rax, rdx);
6379   } else {
6380     notl(rax);
6381     andl(rax, rdx);
6382   }
6383   sarl(rax, 31);
6384   andl(rax, divisor);
6385   subl(rdx, rax);
6386   bind(done);
6387 }
6388 
6389 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6390   Label done;
6391   Label neg_divisor_fastpath;
6392 
6393   cmpl(divisor, 0);
6394   jccb(Assembler::less, neg_divisor_fastpath);
6395   xorl(rdx, rdx);
6396   divl(divisor);
6397   jmpb(done);
6398   bind(neg_divisor_fastpath);
6399   // Fastpath for divisor < 0:
6400   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6401   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6402   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6403   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6404   movl(rdx, rax);
6405   subl(rax, divisor);
6406   if (VM_Version::supports_bmi1()) {
6407     andnl(rax, rax, rdx);
6408   } else {
6409     notl(rax);
6410     andl(rax, rdx);
6411   }
6412   movl(tmp, rax);
6413   shrl(rax, 31); // quotient
6414   sarl(tmp, 31);
6415   andl(tmp, divisor);
6416   subl(rdx, tmp); // remainder
6417   bind(done);
6418 }
6419 
6420 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6421                                  XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, based on the following algorithm:
6424     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
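    // The 64 bit constant encodes an 8x8 bit matrix with one bit set per byte
    // (0x80, 0x40, ..., 0x01); the GF(2) affine transform with this matrix
    // reverses the bit order within each byte, and the bswapl below then
    // reverses the byte order.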
6425     mov64(rtmp, 0x8040201008040201L);
6426     movq(xtmp1, src);
6427     movq(xtmp2, rtmp);
6428     gf2p8affineqb(xtmp1, xtmp2, 0);
6429     movq(dst, xtmp1);
6430   } else {
6431     // Swap even and odd numbered bits.
6432     movl(rtmp, src);
6433     andl(rtmp, 0x55555555);
6434     shll(rtmp, 1);
6435     movl(dst, src);
6436     andl(dst, 0xAAAAAAAA);
6437     shrl(dst, 1);
6438     orl(dst, rtmp);
6439 
6440     // Swap LSB and MSB 2 bits of each nibble.
6441     movl(rtmp, dst);
6442     andl(rtmp, 0x33333333);
6443     shll(rtmp, 2);
6444     andl(dst, 0xCCCCCCCC);
6445     shrl(dst, 2);
6446     orl(dst, rtmp);
6447 
6448     // Swap LSB and MSB 4 bits of each byte.
6449     movl(rtmp, dst);
6450     andl(rtmp, 0x0F0F0F0F);
6451     shll(rtmp, 4);
6452     andl(dst, 0xF0F0F0F0);
6453     shrl(dst, 4);
6454     orl(dst, rtmp);
6455   }
6456   bswapl(dst);
6457 }
6458 
6459 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6460                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, based on the following algorithm:
6463     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6464     mov64(rtmp1, 0x8040201008040201L);
6465     movq(xtmp1, src);
6466     movq(xtmp2, rtmp1);
6467     gf2p8affineqb(xtmp1, xtmp2, 0);
6468     movq(dst, xtmp1);
6469   } else {
6470     // Swap even and odd numbered bits.
6471     movq(rtmp1, src);
6472     mov64(rtmp2, 0x5555555555555555L);
6473     andq(rtmp1, rtmp2);
6474     shlq(rtmp1, 1);
6475     movq(dst, src);
6476     notq(rtmp2);
6477     andq(dst, rtmp2);
6478     shrq(dst, 1);
6479     orq(dst, rtmp1);
6480 
6481     // Swap LSB and MSB 2 bits of each nibble.
6482     movq(rtmp1, dst);
6483     mov64(rtmp2, 0x3333333333333333L);
6484     andq(rtmp1, rtmp2);
6485     shlq(rtmp1, 2);
6486     notq(rtmp2);
6487     andq(dst, rtmp2);
6488     shrq(dst, 2);
6489     orq(dst, rtmp1);
6490 
6491     // Swap LSB and MSB 4 bits of each byte.
6492     movq(rtmp1, dst);
6493     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6494     andq(rtmp1, rtmp2);
6495     shlq(rtmp1, 4);
6496     notq(rtmp2);
6497     andq(dst, rtmp2);
6498     shrq(dst, 4);
6499     orq(dst, rtmp1);
6500   }
6501   bswapq(dst);
6502 }
6503 
6504 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6505   Label done;
6506   Label neg_divisor_fastpath;
6507   cmpq(divisor, 0);
6508   jccb(Assembler::less, neg_divisor_fastpath);
6509   xorl(rdx, rdx);
6510   divq(divisor);
6511   jmpb(done);
6512   bind(neg_divisor_fastpath);
6513   // Fastpath for divisor < 0:
6514   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6515   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6516   movq(rdx, rax);
6517   subq(rdx, divisor);
6518   if (VM_Version::supports_bmi1()) {
6519     andnq(rax, rdx, rax);
6520   } else {
6521     notq(rdx);
6522     andq(rax, rdx);
6523   }
6524   shrq(rax, 63);
6525   bind(done);
6526 }
6527 
6528 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6529   Label done;
6530   Label neg_divisor_fastpath;
6531   cmpq(divisor, 0);
6532   jccb(Assembler::less, neg_divisor_fastpath);
6533   xorq(rdx, rdx);
6534   divq(divisor);
6535   jmp(done);
6536   bind(neg_divisor_fastpath);
6537   // Fastpath when divisor < 0:
6538   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6539   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6540   movq(rdx, rax);
6541   subq(rax, divisor);
6542   if (VM_Version::supports_bmi1()) {
6543     andnq(rax, rax, rdx);
6544   } else {
6545     notq(rax);
6546     andq(rax, rdx);
6547   }
6548   sarq(rax, 63);
6549   andq(rax, divisor);
6550   subq(rdx, rax);
6551   bind(done);
6552 }
6553 
6554 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6555   Label done;
6556   Label neg_divisor_fastpath;
6557   cmpq(divisor, 0);
6558   jccb(Assembler::less, neg_divisor_fastpath);
6559   xorq(rdx, rdx);
6560   divq(divisor);
6561   jmp(done);
6562   bind(neg_divisor_fastpath);
6563   // Fastpath for divisor < 0:
6564   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6565   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6566   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6567   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6568   movq(rdx, rax);
6569   subq(rax, divisor);
6570   if (VM_Version::supports_bmi1()) {
6571     andnq(rax, rax, rdx);
6572   } else {
6573     notq(rax);
6574     andq(rax, rdx);
6575   }
6576   movq(tmp, rax);
6577   shrq(rax, 63); // quotient
6578   sarq(tmp, 63);
6579   andq(tmp, divisor);
6580   subq(rdx, tmp); // remainder
6581   bind(done);
6582 }
6583 
6584 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6585                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6586                                         int vlen_enc) {
6587   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using
  // the lower 4 bits of each shuffle lane, thus all shuffle indices are
  // normalized to the index range 0-15. This makes sure that all the
  // multiples of 16 map to the same relative position in a 128 bit lane,
  // i.e. elements corresponding to shuffle indices 16, 32 and 48 all select
  // byte 0 of their respective 128 bit lanes.
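  // E.g. for a 512 bit vector, shuffle index 37 lies in the range [32, 48),
  // so the third 128 bit lane (imm 0xAA) is broadcast below and the in-lane
  // index 37 & 0xF = 5 then picks byte 5 of that lane, i.e. byte 37 of src.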
6594   movl(rtmp, 16);
6595   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6596 
  // Compute a mask for the shuffle vector by comparing indices with the
  // expression INDEX < 16. Broadcast the first 128 bit lane across the
  // entire vector, shuffle the vector lanes using the original shuffle
  // indices, and move the shuffled lanes corresponding to a true mask into
  // the destination vector.
6601   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6602   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6603   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6604 
6605   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6606   // and broadcasting second 128 bit lane.
6607   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6608   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6609   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6610   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6611   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6612 
6613   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6614   // and broadcasting third 128 bit lane.
6615   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6616   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6617   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6618   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6619   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6620 
6621   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting the fourth 128 bit lane.
6623   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6624   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6625   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6626   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6627   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6628 }
6629 
6630 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6631                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6632   if (vlen_enc == AVX_128bit) {
6633     vpermilps(dst, src, shuffle, vlen_enc);
6634   } else if (bt == T_INT) {
6635     vpermd(dst, shuffle, src, vlen_enc);
6636   } else {
6637     assert(bt == T_FLOAT, "");
6638     vpermps(dst, shuffle, src, vlen_enc);
6639   }
6640 }
6641 
6642 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6643   switch(opcode) {
6644     case Op_AddHF: vaddsh(dst, src1, src2); break;
6645     case Op_SubHF: vsubsh(dst, src1, src2); break;
6646     case Op_MulHF: vmulsh(dst, src1, src2); break;
6647     case Op_DivHF: vdivsh(dst, src1, src2); break;
6648     default: assert(false, "%s", NodeClassNames[opcode]); break;
6649   }
6650 }
6651 
6652 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6653   switch(elem_bt) {
6654     case T_BYTE:
6655       if (ideal_opc == Op_SaturatingAddV) {
6656         vpaddsb(dst, src1, src2, vlen_enc);
6657       } else {
6658         assert(ideal_opc == Op_SaturatingSubV, "");
6659         vpsubsb(dst, src1, src2, vlen_enc);
6660       }
6661       break;
6662     case T_SHORT:
6663       if (ideal_opc == Op_SaturatingAddV) {
6664         vpaddsw(dst, src1, src2, vlen_enc);
6665       } else {
6666         assert(ideal_opc == Op_SaturatingSubV, "");
6667         vpsubsw(dst, src1, src2, vlen_enc);
6668       }
6669       break;
6670     default:
6671       fatal("Unsupported type %s", type2name(elem_bt));
6672       break;
6673   }
6674 }
6675 
6676 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6677   switch(elem_bt) {
6678     case T_BYTE:
6679       if (ideal_opc == Op_SaturatingAddV) {
6680         vpaddusb(dst, src1, src2, vlen_enc);
6681       } else {
6682         assert(ideal_opc == Op_SaturatingSubV, "");
6683         vpsubusb(dst, src1, src2, vlen_enc);
6684       }
6685       break;
6686     case T_SHORT:
6687       if (ideal_opc == Op_SaturatingAddV) {
6688         vpaddusw(dst, src1, src2, vlen_enc);
6689       } else {
6690         assert(ideal_opc == Op_SaturatingSubV, "");
6691         vpsubusw(dst, src1, src2, vlen_enc);
6692       }
6693       break;
6694     default:
6695       fatal("Unsupported type %s", type2name(elem_bt));
6696       break;
6697   }
6698 }
6699 
6700 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6701                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when the magnitude of the second input is greater than that of the first input.
6703   // overflow_mask = Inp1 <u Inp2
6704   evpcmpu(elem_bt, ktmp,  src2, src1, Assembler::lt, vlen_enc);
6705   // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6706   evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6707 }
6708 
6709 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6710                                                               XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Emulate the unsigned comparison using a signed comparison:
  // Mask = Inp1 <u Inp2 => (Inp1 + MIN_VALUE) <s (Inp2 + MIN_VALUE)
6713   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6714   vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6715   vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6716 
6717   vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6718 
6719   // Res = INP1 - INP2 (non-commutative and non-associative)
6720   vpsub(elem_bt, dst, src1, src2, vlen_enc);
6721   // Res = Mask ? Zero : Res
6722   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6723   vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6724 }
6725 
6726 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6727                                                                XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned value ranges comprise only non-negative numbers, thus only an upper bound saturation exists.
6729   // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6730   // Res = Signed Add INP1, INP2
6731   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6732   // T1 = SRC1 | SRC2
6733   vpor(xtmp1, src1, src2, vlen_enc);
6734   // Max_Unsigned = -1
6735   vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6736   // Unsigned compare:  Mask = Res <u T1
6737   evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6738   // res  = Mask ? Max_Unsigned : Res
6739   evpblend(elem_bt, dst, ktmp,  dst, xtmp2, true, vlen_enc);
6740 }
6741 
6742 //
// Section 2-13 of Hacker's Delight lists the following overflow detection check for
// saturating unsigned addition:
//    overflow_mask = ((a & b) | ((a | b) & ~(a + b))) >>> 31 == 1
//
// We empirically determined its semantic equivalence to the following reduced expression
//    overflow_mask = (a + b) <u (a | b)
//
// and also verified it through the Alive2 solver.
// (https://alive2.llvm.org/ce/z/XDQ7dY)
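//
// Quick check of the reduced expression: with a = 0xFFFFFFFF and b = 1,
// a + b wraps to 0 while a | b = 0xFFFFFFFF, so 0 <u 0xFFFFFFFF flags the
// overflow; with a = 1 and b = 2 there is no overflow and 3 <u 3 is false.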
6752 //
6753 
6754 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6755                                                               XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6756   // Res = Signed Add INP1, INP2
6757   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6758   // Compute T1 = INP1 | INP2
6759   vpor(xtmp3, src1, src2, vlen_enc);
  // xtmp2 = MIN_VALUE (as a side effect xtmp1 is left holding all ones, i.e. Max_Unsigned)
  vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  // Convert T1 to a signed value, T1 = T1 + MIN_VALUE
  vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
  // Convert Res to a signed value, Res<s> = Res + MIN_VALUE
  vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<s> <s T1
6767   if (elem_bt == T_INT) {
6768     vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
6769   } else {
6770     assert(elem_bt == T_LONG, "");
6771     vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
6772   }
6773   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6774 }
6775 
6776 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6777                                       int vlen_enc, bool xtmp2_hold_M1) {
6778   if (VM_Version::supports_avx512dq()) {
6779     evpmovq2m(ktmp, src, vlen_enc);
6780   } else {
6781     assert(VM_Version::supports_evex(), "");
6782     if (!xtmp2_hold_M1) {
6783       vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6784     }
6785     evpsraq(xtmp1, src, 63, vlen_enc);
6786     evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6787   }
6788 }
6789 
6790 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6791                                       int vlen_enc, bool xtmp2_hold_M1) {
6792   if (VM_Version::supports_avx512dq()) {
6793     evpmovd2m(ktmp, src, vlen_enc);
6794   } else {
6795     assert(VM_Version::supports_evex(), "");
6796     if (!xtmp2_hold_M1) {
6797       vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6798     }
6799     vpsrad(xtmp1, src, 31, vlen_enc);
6800     Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6801   }
6802 }
6803
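     // Broadcast each lane's sign bit across the whole lane (yielding 0 or -1 per lane).
     // Without EVEX there is no packed 64-bit arithmetic right shift, so for longs the high
     // dword of each lane is shifted and then duplicated into the low dword via
     // vpshufd(0xF5), which selects dwords [1, 1, 3, 3].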
6805 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6806   if (elem_bt == T_LONG) {
6807     if (VM_Version::supports_evex()) {
6808       evpsraq(dst, src, 63, vlen_enc);
6809     } else {
6810       vpsrad(dst, src, 31, vlen_enc);
6811       vpshufd(dst, dst, 0xF5, vlen_enc);
6812     }
6813   } else {
6814     assert(elem_bt == T_INT, "");
6815     vpsrad(dst, src, 31, vlen_enc);
6816   }
6817 }
6818 
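     // Materialize per-lane MAX_VALUE (0x7FF..F) by logically shifting an all-ones vector
     // right by one bit; vpgenmin_value below builds MIN_VALUE (0x800..0) by shifting
     // all-ones left into the sign bit. vpternlogd with imm8 0xff writes all-ones
     // irrespective of its inputs; targets without AVX512VL fall back to vpcmpeqq.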
6819 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6820   if (compute_allones) {
6821     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6822       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6823     } else {
6824       vpcmpeqq(allones, allones, allones, vlen_enc);
6825     }
6826   }
6827   if (elem_bt == T_LONG) {
6828     vpsrlq(dst, allones, 1, vlen_enc);
6829   } else {
6830     assert(elem_bt == T_INT, "");
6831     vpsrld(dst, allones, 1, vlen_enc);
6832   }
6833 }
6834 
6835 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6836   if (compute_allones) {
6837     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6838       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6839     } else {
6840       vpcmpeqq(allones, allones, allones, vlen_enc);
6841     }
6842   }
6843   if (elem_bt == T_LONG) {
6844     vpsllq(dst, allones, 63, vlen_enc);
6845   } else {
6846     assert(elem_bt == T_INT, "");
6847     vpslld(dst, allones, 31, vlen_enc);
6848   }
6849 }
6850 
6851 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
6852                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6853   switch(elem_bt) {
6854     case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6855     case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6856     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6857     case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6858     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6859   }
6860 }
6861 
6862 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6863   switch(elem_bt) {
6864     case  T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
6865     case  T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
6866     case  T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6867     case  T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
6868     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6869   }
6870 }
6871 
6872 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6873                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6874   if (elem_bt == T_LONG) {
6875     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6876   } else {
6877     assert(elem_bt == T_INT, "");
6878     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6879   }
6880 }
6881 
6882 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6883                                                          XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6884                                                          KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6885   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6886   // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6887   // Overflow detection is based on Hacker's Delight section 2-13.
6888   if (ideal_opc == Op_SaturatingAddV) {
6889     // res = src1 + src2
6890     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6891     // Overflow occurs if the inputs have the same polarity and the result's polarity differs from it.
6892     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
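         // e.g. with 8-bit lanes: 100 + 100 wraps to -56 (0xC8); res ^ src1 = res ^ src2 = 0xAC,
         // whose sign bit is set, so the lane is flagged as overflowed.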
6893     vpxor(xtmp1, dst, src1, vlen_enc);
6894     vpxor(xtmp2, dst, src2, vlen_enc);
6895     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6896   } else {
6897     assert(ideal_opc == Op_SaturatingSubV, "");
6898     // res = src1 - src2
6899     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6900     // Overflow occurs when the inputs have opposite polarity and the
6901     // result's polarity does not match the first input's polarity.
6902     // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
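         // e.g. with 8-bit lanes: 100 - (-100) wraps to -56 (0xC8); src1 ^ src2 = 0xF8 and
         // res ^ src1 = 0xAC both have their sign bits set, so the lane is flagged as overflowed.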
6903     vpxor(xtmp1, src1, src2, vlen_enc);
6904     vpxor(xtmp2, dst, src1, vlen_enc);
6905     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6906   }
6907 
6908   // Compute overflow detection mask.
6909   evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
6910   // Note: xtmp1 holds -1 in all its lanes after the above call.
6911 
6912   // Compute mask based on first input polarity.
6913   evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
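       // On overflow the saturated result must take src1's sign: an overflowing addition
       // implies both inputs share src1's sign, and an overflowing subtraction implies src2
       // has the opposite sign, so in either case the exact result's direction follows src1.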
6914 
6915   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6916   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6917 
6918   // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
6919   // set bits in the first input polarity mask hold the min value.
6920   evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6921   // Blend destination lanes with saturated values using overflow detection mask.
6922   evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6923 }
6924
6926 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6927                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6928                                                         XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
6929   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6930   // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6931   // Overflow detection is based on Hacker's Delight section 2-13.
6932   if (ideal_opc == Op_SaturatingAddV) {
6933     // res = src1 + src2
6934     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6935     // Overflow occurs if the inputs have the same polarity and the result's polarity differs from it.
6936     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6937     vpxor(xtmp1, dst, src1, vlen_enc);
6938     vpxor(xtmp2, dst, src2, vlen_enc);
6939     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6940   } else {
6941     assert(ideal_opc == Op_SaturatingSubV, "");
6942     // res = src1 - src2
6943     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6944     // Overflow occurs when the inputs have opposite polarity and the
6945     // result's polarity does not match the first input's polarity.
6946     // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
6947     vpxor(xtmp1, src1, src2, vlen_enc);
6948     vpxor(xtmp2, dst, src1, vlen_enc);
6949     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6950   }
6951 
6952   // Sign-extend to compute overflow detection mask.
6953   vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
6954 
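       // xtmp1 = all-ones; reused below to derive the per-lane MAX and MIN saturation values.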
6955   vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
6956   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
6957   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6958 
6959   // Compose saturating min/max vector using first input polarity mask.
6960   vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
6961   vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
6962 
6963   // Blend result with saturating vector using overflow detection mask.
6964   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6965 }
6966 
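     // For byte/short lanes the ISA saturates directly: the packed saturating add/subtract
     // instructions (vpadds*/vpsubs* and the unsigned vpaddus*/vpsubus* forms below) clamp
     // in hardware, unlike the int/long flavors above which need explicit overflow handling.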
6967 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6968   switch(elem_bt) {
6969     case T_BYTE:
6970       if (ideal_opc == Op_SaturatingAddV) {
6971         vpaddsb(dst, src1, src2, vlen_enc);
6972       } else {
6973         assert(ideal_opc == Op_SaturatingSubV, "");
6974         vpsubsb(dst, src1, src2, vlen_enc);
6975       }
6976       break;
6977     case T_SHORT:
6978       if (ideal_opc == Op_SaturatingAddV) {
6979         vpaddsw(dst, src1, src2, vlen_enc);
6980       } else {
6981         assert(ideal_opc == Op_SaturatingSubV, "");
6982         vpsubsw(dst, src1, src2, vlen_enc);
6983       }
6984       break;
6985     default:
6986       fatal("Unsupported type %s", type2name(elem_bt));
6987       break;
6988   }
6989 }
6990 
6991 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6992   switch(elem_bt) {
6993     case T_BYTE:
6994       if (ideal_opc == Op_SaturatingAddV) {
6995         vpaddusb(dst, src1, src2, vlen_enc);
6996       } else {
6997         assert(ideal_opc == Op_SaturatingSubV, "");
6998         vpsubusb(dst, src1, src2, vlen_enc);
6999       }
7000       break;
7001     case T_SHORT:
7002       if (ideal_opc == Op_SaturatingAddV) {
7003         vpaddusw(dst, src1, src2, vlen_enc);
7004       } else {
7005         assert(ideal_opc == Op_SaturatingSubV, "");
7006         vpsubusw(dst, src1, src2, vlen_enc);
7007       }
7008       break;
7009     default:
7010       fatal("Unsupported type %s", type2name(elem_bt));
7011       break;
7012   }
7013 }
7014 
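     // Two-table permute: dst initially holds the selection indices, and vpermi2* overwrites
     // it with elements gathered from the concatenation of src1 (table 0) and src2 (table 1)
     // as directed by those indices.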
7015 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
7016                                                      XMMRegister src2, int vlen_enc) {
7017   switch(elem_bt) {
7018     case T_BYTE:
7019       evpermi2b(dst, src1, src2, vlen_enc);
7020       break;
7021     case T_SHORT:
7022       evpermi2w(dst, src1, src2, vlen_enc);
7023       break;
7024     case T_INT:
7025       evpermi2d(dst, src1, src2, vlen_enc);
7026       break;
7027     case T_LONG:
7028       evpermi2q(dst, src1, src2, vlen_enc);
7029       break;
7030     case T_FLOAT:
7031       evpermi2ps(dst, src1, src2, vlen_enc);
7032       break;
7033     case T_DOUBLE:
7034       evpermi2pd(dst, src1, src2, vlen_enc);
7035       break;
7036     default:
7037       fatal("Unsupported type %s", type2name(elem_bt));
7038       break;
7039   }
7040 }
7041 
7042 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
7043   if (is_unsigned) {
7044     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7045   } else {
7046     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7047   }
7048 }
7049 
7050 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
7051   if (is_unsigned) {
7052     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7053   } else {
7054     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7055   }
7056 }
7057 
7058 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
7059   switch(opcode) {
7060     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7061     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7062     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7063     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7064     default: assert(false, "%s", NodeClassNames[opcode]); break;
7065   }
7066 }
7067 
7068 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7069   switch(opcode) {
7070     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7071     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7072     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7073     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7074     default: assert(false, "%s", NodeClassNames[opcode]); break;
7075   }
7076 }
7077 
7078 void C2_MacroAssembler::sminmax_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7079                                      KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
7080   vminmax_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
7081 }
7082 
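     // On AVX10.2 targets a single VMINMAXSH suffices: with the imm8 controls used here the
     // instruction is expected to implement IEEE 754-2019 minimum/maximum semantics (NaN
     // propagation, -0.0 ordered below +0.0), matching what the multi-instruction sequence
     // in vminmax_fp16 reproduces by hand.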
7083 void C2_MacroAssembler::sminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7084                                              KRegister ktmp) {
7085   if (opcode == Op_MaxHF) {
7086     // dst = max(src1, src2)
7087     evminmaxsh(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN);
7088   } else {
7089     assert(opcode == Op_MinHF, "");
7090     // dst = min(src1, src2)
7091     evminmaxsh(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN);
7092   }
7093 }
7094 
7095 void C2_MacroAssembler::vminmax_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7096                                      KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
7097   if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
7098     // Move sign bits of src2 to mask register.
7099     evpmovw2m(ktmp, src2, vlen_enc);
7100     // xtmp1 = src2 < 0 ? src2 : src1
7101     evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7102     // xtmp2 = src2 < 0 ? src1 : src2
7103     evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
7104     // The idea behind the above swapping is to make the second source operand a non-negative value.
7105     // As per the instruction semantics, if the values being compared are both 0.0 (of either sign),
7106     // the value in the second source operand is returned. If only one value is a NaN (SNaN or QNaN),
7107     // the second source operand, be it a NaN or a valid floating-point value, is returned.
7108     // dst = max(xtmp1, xtmp2)
7109     evmaxph(dst, xtmp1, xtmp2, vlen_enc);
7110     // isNaN = is_unordered_quiet(xtmp1)
7111     evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
7112     // The final result equals the first source if it is a NaN; if instead the second
7113     // operand holds a NaN then, per the semantics above, the result already equals
7114     // the second operand.
7115     Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7116   } else {
7117     assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
7118     // Move sign bits of src1 to mask register.
7119     evpmovw2m(ktmp, src1, vlen_enc);
7120     // xtmp1 = src1 < 0 ? src2 : src1
7121     evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7122     // xtmp2 = src1 < 0 ? src1 : src2
7123     evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
7124     // The idea behind the above swapping is to make the second source operand a negative value.
7125     // As per the instruction semantics, if the values being compared are both 0.0 (of either sign),
7126     // the value in the second source operand is returned.
7127     // If only one value is a NaN (SNaN or QNaN), the second source operand, be it a NaN
7128     // or a valid floating-point value, is written to the result.
7129     // dst = min(xtmp1, xtmp2)
7130     evminph(dst, xtmp1, xtmp2, vlen_enc);
7131     // isNaN = is_unordered_quiet(xtmp1)
7132     evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
7133     // The final result equals the first source if it is a NaN; if instead the second
7134     // operand holds a NaN then, per the semantics above, the result already equals
7135     // the second operand.
7136     Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7137   }
7138 }
7139 
7140 void C2_MacroAssembler::vminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7141                                              KRegister ktmp, int vlen_enc) {
7142   if (opcode == Op_MaxVHF) {
7143     // dst = max(src1, src2)
7144     evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vlen_enc);
7145   } else {
7146     assert(opcode == Op_MinVHF, "");
7147     // dst = min(src1, src2)
7148     evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vlen_enc);
7149   }
7150 }
7151 
7152 void C2_MacroAssembler::vminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, Address src2,
7153                                              KRegister ktmp, int vlen_enc) {
7154   if (opcode == Op_MaxVHF) {
7155     // dst = max(src1, src2)
7156     evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vlen_enc);
7157   } else {
7158     assert(opcode == Op_MinVHF, "");
7159     // dst = min(src1, src2)
7160     evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vlen_enc);
7161   }
7162 }
7163 
7164 int C2_MacroAssembler::vector_iota_entry_index(BasicType bt) {
7165   // The vector iota entries array is ordered by type as B/S/I/L/F/D, and
7166   // the offset between two adjacent types is 16.
7167   switch(bt) {
7168   case T_BYTE:
7169     return 0;
7170   case T_SHORT:
7171     return 1;
7172   case T_INT:
7173     return 2;
7174   case T_LONG:
7175     return 3;
7176   case T_FLOAT:
7177     return 4;
7178   case T_DOUBLE:
7179     return 5;
7180   default:
7181     ShouldNotReachHere();
7182   }
7183 }