1 /*
   2  * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "gc/shared/barrierSet.hpp"
  28 #include "gc/shared/barrierSetAssembler.hpp"
  29 #include "oops/methodData.hpp"
  30 #include "opto/c2_MacroAssembler.hpp"
  31 #include "opto/intrinsicnode.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/opcodes.hpp"
  34 #include "opto/subnode.hpp"
  35 #include "runtime/globals.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/objectMonitorTable.hpp"
  38 #include "runtime/stubRoutines.hpp"
  39 #include "runtime/synchronizer.hpp"
  40 #include "utilities/checkedCast.hpp"
  41 #include "utilities/globalDefinitions.hpp"
  42 #include "utilities/powerOfTwo.hpp"
  43 #include "utilities/sizes.hpp"
  44 
  45 #ifdef PRODUCT
  46 #define BLOCK_COMMENT(str) /* nothing */
  47 #define STOP(error) stop(error)
  48 #else
  49 #define BLOCK_COMMENT(str) block_comment(str)
  50 #define STOP(error) block_comment(error); stop(error)
  51 #endif
  52 
  53 // C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  // Emits the frame-setup prolog: optional stack-bang, save of rbp, frame
  // allocation, optional stack-depth cookie/alignment checks, and (for
  // non-stub methods) the nmethod entry barrier.
  // Note: fp_mode_24b is not used in this method body.
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // No stack bang needed: allocate the whole frame first, then store rbp
    // into its slot at the top of the new frame.
    subptr(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    // Check the stack-alignment invariant at method entry: rsp modulo
    // StackAlignmentInBytes must equal StackAlignmentInBytes - wordSize
    // here (rax is pushed just above, shifting rsp by one word).
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
    // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
    Label dummy_slow_path;
    Label dummy_continuation;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
    }
    bs->nmethod_entry_barrier(this, slow_path, continuation);
  }
}
 135 
 136 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 137   switch (vlen_in_bytes) {
 138     case  4: // fall-through
 139     case  8: // fall-through
 140     case 16: return Assembler::AVX_128bit;
 141     case 32: return Assembler::AVX_256bit;
 142     case 64: return Assembler::AVX_512bit;
 143 
 144     default: {
 145       ShouldNotReachHere();
 146       return Assembler::AVX_NoVec;
 147     }
 148   }
 149 }
 150 
 151 // fast_lock and fast_unlock used by C2
 152 
 153 // Because the transitions from emitted code to the runtime
 154 // monitorenter/exit helper stubs are so slow it's critical that
 155 // we inline both the stack-locking fast path and the inflated fast path.
 156 //
 157 // See also: cmpFastLock and cmpFastUnlock.
 158 //
 159 // What follows is a specialized inline transliteration of the code
 160 // in enter() and exit(). If we're concerned about I$ bloat another
 161 // option would be to emit TrySlowEnter and TrySlowExit methods
 162 // at startup-time.  These methods would accept arguments as
 163 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 164 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 165 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 166 // In practice, however, the # of lock sites is bounded and is usually small.
 167 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
 168 // if the processor uses simple bimodal branch predictors keyed by EIP
 169 // Since the helper routines would be called from multiple synchronization
 170 // sites.
 171 //
 172 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
 173 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 174 // to those specialized methods.  That'd give us a mostly platform-independent
 175 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
 177 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 178 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 179 // (b) explicit barriers or fence operations.
 180 //
 181 // TODO:
 182 //
 183 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 184 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 185 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 186 //    the lock operators would typically be faster than reifying Self.
 187 //
 188 // *  Ideally I'd define the primitives as:
 189 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 190 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 191 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with rather awkward and brittle register assignments below.
 193 //    Furthermore the register assignments are overconstrained, possibly resulting in
 194 //    sub-optimal code near the synchronization site.
 195 //
 196 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 197 //    Alternately, use a better sp-proximity test.
 198 //
 199 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 200 //    Either one is sufficient to uniquely identify a thread.
 201 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 202 //
 203 // *  Intrinsify notify() and notifyAll() for the common cases where the
 204 //    object is locked by the calling thread but the waitlist is empty.
 205 //    avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
 206 //
 207 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 208 //    But beware of excessive branch density on AMD Opterons.
 209 //
 210 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 211 //    or failure of the fast path.  If the fast path fails then we pass
 212 //    control to the slow path, typically in C.  In fast_lock and
 213 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 214 //    will emit a conditional branch immediately after the node.
 215 //    So we have branches to branches and lots of ICC.ZF games.
 216 //    Instead, it might be better to have C2 pass a "FailureLabel"
 217 //    into fast_lock and fast_unlock.  In the case of success, control
 218 //    will drop through the node.  ICC.ZF is undefined at exit.
 219 //    In the case of failure, the node will branch directly to the
 220 //    FailureLabel
 221 
 222 // obj: object to lock
 223 // box: on-stack box address -- KILLED
 224 // rax: tmp -- KILLED
 225 // t  : tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register rax_reg,
                                  Register t, Register thread) {
  // Emits the inlined fast path for monitorenter. On exit, ZF == 1 means the
  // lock was acquired; ZF == 0 means the caller must take the slow path.
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Value-based classes always go to the slow path so the diagnostic
    // can be reported there.
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  // t doubles as the mark-word register in the fast-lock section below.
  const Register mark = t;

  { // Fast Lock

    Label push;

    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    // rax holds the expected (unlocked) mark; mark holds the desired
    // (locked) value for the CAS below.
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmp(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // hash aliases t/monitor; it is consumed before monitor is written.
      const Register hash = t;
      Label monitor_found;

      // Look for the monitor in the om_cache.

      // Unrolled scan of the per-thread cache: each entry pairs an oop with
      // its monitor at a fixed distance (monitor_offset).
      ByteSize cache_offset   = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled  = OMCache::CAPACITY;
      for (int i = 0; i < num_unrolled; i++) {
        movptr(monitor, Address(thread,  cache_offset + monitor_offset));
        cmpptr(obj, Address(thread, cache_offset));
        jccb(Assembler::equal, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      // Look for the monitor in the table.

      // Get the hash code.
      movptr(hash, Address(obj, oopDesc::mark_offset_in_bytes()));
      shrq(hash, markWord::hash_shift);
      andq(hash, markWord::hash_mask);

      // Get the table and calculate the bucket's address.
      lea(rax_reg, ExternalAddress(ObjectMonitorTable::current_table_address()));
      movptr(rax_reg, Address(rax_reg));
      andq(hash, Address(rax_reg, ObjectMonitorTable::table_capacity_mask_offset()));
      movptr(rax_reg, Address(rax_reg, ObjectMonitorTable::table_buckets_offset()));

      // Read the monitor from the bucket.
      movptr(monitor, Address(rax_reg, hash, Address::times_ptr));

      // Check if the monitor in the bucket is special (empty, tombstone or removed)
      cmpptr(monitor, ObjectMonitorTable::SpecialPointerValues::below_is_special);
      jcc(Assembler::below, slow_path);

      // Check if object matches.
      // The monitor holds its object as a weak handle; resolve it and bail
      // to the slow path if the handle is dead or does not match obj.
      movptr(rax_reg, Address(monitor, ObjectMonitor::object_offset()));
      BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
      bs_asm->try_resolve_weak_handle_in_c2(this, rax_reg, slow_path);
      cmpptr(rax_reg, obj);
      jcc(Assembler::notEqual, slow_path);

      bind(monitor_found);
    }
    // Without the table, monitor still carries the 0b10 tag bits from the
    // mark word; fold the tag into the field offsets instead of clearing it.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock before trashing box. On failure to acquire
      // the lock, the slow path will reset the entry accordingly (see CacheSetter).
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
    lock(); cmpxchgptr(box, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive.
    // After a failed CAS, rax holds the current owner; compare it with our id.
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
 397 
 398 // obj: object to lock
 399 // rax: tmp -- KILLED
 400 // t  : tmp - cannot be obj nor rax -- KILLED
 401 //
 402 // Some commentary on balanced locking:
 403 //
 404 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 405 // Methods that don't have provably balanced locking are forced to run in the
 406 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 407 // The interpreter provides two properties:
 408 // I1:  At return-time the interpreter automatically and quietly unlocks any
 409 //      objects acquired in the current activation (frame).  Recall that the
 410 //      interpreter maintains an on-stack list of locks currently held by
 411 //      a frame.
 412 // I2:  If a method attempts to unlock an object that is not held by the
 413 //      frame the interpreter throws IMSX.
 414 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 416 // B() doesn't have provably balanced locking so it runs in the interpreter.
 417 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 418 // is still locked by A().
 419 //
 420 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface
 421 // Specification" states that an object locked by JNI's MonitorEnter should not be
 422 // unlocked by "normal" java-level locking and vice-versa.  The specification doesn't
 423 // specify what will occur if a program engages in such mixed-mode locking, however.
 424 // Arguably given that the spec legislates the JNI case as undefined our implementation
 425 // could reasonably *avoid* checking owner in fast_unlock().
 426 // In the interest of performance we elide m->Owner==Self check in unlock.
 427 // A perfectly viable alternative is to elide the owner check except when
 428 // Xcheck:jni is enabled.
 429 
void C2_MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register t, Register thread) {
  // Emits the inlined fast path for monitorexit. On exit, ZF == 1 means the
  // unlock succeeded; ZF == 0 means the caller must take the slow path.
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully.  MUST jump with ZF == 1
  Label unlocked, slow_path;

  // Register aliases: t serves as mark and (later) monitor; reg_rax serves
  // as box, and as top when the monitor table is not in use.
  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    // Real out-of-line stub; during size measurement a dummy label is used.
    stub = new (Compile::current()->comp_arena()) C2FastUnlockStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Fast Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    // In debug builds, clear the popped slot to catch stale reads.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    // If the next slot down also holds obj, this was a recursive lock.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    // reg_rax holds the expected (locked) mark; mark holds the desired
    // (unlocked) value for the CAS below.
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    // On CAS failure, re-push obj and go to the slow path (stub code).
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    // Debug-only: verify obj appears nowhere on the lock-stack and that the
    // mark word really designates a monitor.
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jcc(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    // Without the table, monitor still carries the 0b10 tag bits from the
    // mark word; fold the tag into the field offsets instead of clearing it.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jcc(Assembler::notZero, recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    cmpptr(entry_list_address, NULL_WORD);
    jcc(Assembler::zero, unlocked);    // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jcc(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      // Strip the 0b10 tag before storing the raw monitor pointer.
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
  }

  bind(unlocked);
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Unlock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
 591 
// Runtime target for a failed CastII range check (see verify_int_in_range);
// reports the offending node index and value, then aborts the VM.
static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}
 595 
 596 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
 597   const int framesize = Compile::current()->output()->frame_size_in_bytes();
 598   masm->movptr(dst, rsp);
 599   if (framesize > 2 * wordSize) {
 600     masm->addptr(dst, framesize - 2 * wordSize);
 601   }
 602 }
 603 
 604 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
 605   if (PreserveFramePointer) {
 606     // frame pointer is valid
 607 #ifdef ASSERT
 608     // Verify frame pointer value in rbp.
 609     reconstruct_frame_pointer_helper(this, rtmp);
 610     Label L_success;
 611     cmpq(rbp, rtmp);
 612     jccb(Assembler::equal, L_success);
 613     STOP("frame pointer mismatch");
 614     bind(L_success);
 615 #endif // ASSERT
 616   } else {
 617     reconstruct_frame_pointer_helper(this, rbp);
 618   }
 619 }
 620 
 621 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
 622   jint lo = t->_lo;
 623   jint hi = t->_hi;
 624   assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
 625   if (t == TypeInt::INT) {
 626     return;
 627   }
 628 
 629   BLOCK_COMMENT("CastII {");
 630   Label fail;
 631   Label succeed;
 632 
 633   if (lo != min_jint) {
 634     cmpl(val, lo);
 635     jccb(Assembler::less, fail);
 636   }
 637   if (hi != max_jint) {
 638     cmpl(val, hi);
 639     jccb(Assembler::greater, fail);
 640   }
 641   jmpb(succeed);
 642 
 643   bind(fail);
 644   movl(c_rarg0, idx);
 645   movl(c_rarg1, val);
 646   movl(c_rarg2, lo);
 647   movl(c_rarg3, hi);
 648   reconstruct_frame_pointer(rscratch1);
 649   call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
 650   hlt();
 651   bind(succeed);
 652   BLOCK_COMMENT("} // CastII");
 653 }
 654 
// Runtime target for a failed CastLL range check (see verify_long_in_range);
// reports the offending node index and value, then aborts the VM.
static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}
 658 
 659 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
 660   jlong lo = t->_lo;
 661   jlong hi = t->_hi;
 662   assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
 663   if (t == TypeLong::LONG) {
 664     return;
 665   }
 666 
 667   BLOCK_COMMENT("CastLL {");
 668   Label fail;
 669   Label succeed;
 670 
 671   auto cmp_val = [&](jlong bound) {
 672     if (is_simm32(bound)) {
 673       cmpq(val, checked_cast<int>(bound));
 674     } else {
 675       mov64(tmp, bound);
 676       cmpq(val, tmp);
 677     }
 678   };
 679 
 680   if (lo != min_jlong) {
 681     cmp_val(lo);
 682     jccb(Assembler::less, fail);
 683   }
 684   if (hi != max_jlong) {
 685     cmp_val(hi);
 686     jccb(Assembler::greater, fail);
 687   }
 688   jmpb(succeed);
 689 
 690   bind(fail);
 691   movl(c_rarg0, idx);
 692   movq(c_rarg1, val);
 693   mov64(c_rarg2, lo);
 694   mov64(c_rarg3, hi);
 695   reconstruct_frame_pointer(rscratch1);
 696   call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
 697   hlt();
 698   bind(succeed);
 699   BLOCK_COMMENT("} // CastLL");
 700 }
 701 
 702 //-------------------------------------------------------------------------------------------
 703 // Generic instructions support for use in .ad files C2 code generation
 704 
 705 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 706   if (dst != src) {
 707     movdqu(dst, src);
 708   }
 709   if (opcode == Op_AbsVD) {
 710     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 711   } else {
 712     assert((opcode == Op_NegVD),"opcode should be Op_NegD");
 713     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 714   }
 715 }
 716 
 717 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 718   if (opcode == Op_AbsVD) {
 719     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 720   } else {
 721     assert((opcode == Op_NegVD),"opcode should be Op_NegD");
 722     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 723   }
 724 }
 725 
 726 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 727   if (dst != src) {
 728     movdqu(dst, src);
 729   }
 730   if (opcode == Op_AbsVF) {
 731     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 732   } else {
 733     assert((opcode == Op_NegVF),"opcode should be Op_NegF");
 734     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 735   }
 736 }
 737 
 738 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 739   if (opcode == Op_AbsVF) {
 740     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 741   } else {
 742     assert((opcode == Op_NegVF),"opcode should be Op_NegF");
 743     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 744   }
 745 }
 746 
 747 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 748   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 749   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 750 
 751   if (opcode == Op_MinV) {
 752     if (elem_bt == T_BYTE) {
 753       pminsb(dst, src);
 754     } else if (elem_bt == T_SHORT) {
 755       pminsw(dst, src);
 756     } else if (elem_bt == T_INT) {
 757       pminsd(dst, src);
 758     } else {
 759       assert(elem_bt == T_LONG, "required");
 760       assert(tmp == xmm0, "required");
 761       assert_different_registers(dst, src, tmp);
 762       movdqu(xmm0, dst);
 763       pcmpgtq(xmm0, src);
 764       blendvpd(dst, src);  // xmm0 as mask
 765     }
 766   } else { // opcode == Op_MaxV
 767     if (elem_bt == T_BYTE) {
 768       pmaxsb(dst, src);
 769     } else if (elem_bt == T_SHORT) {
 770       pmaxsw(dst, src);
 771     } else if (elem_bt == T_INT) {
 772       pmaxsd(dst, src);
 773     } else {
 774       assert(elem_bt == T_LONG, "required");
 775       assert(tmp == xmm0, "required");
 776       assert_different_registers(dst, src, tmp);
 777       movdqu(xmm0, src);
 778       pcmpgtq(xmm0, dst);
 779       blendvpd(dst, src);  // xmm0 as mask
 780     }
 781   }
 782 }
 783 
 784 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 785                                   XMMRegister src1, Address src2, int vlen_enc) {
 786   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 787   if (opcode == Op_UMinV) {
 788     switch(elem_bt) {
 789       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 790       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 791       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 792       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 793       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 794     }
 795   } else {
 796     assert(opcode == Op_UMaxV, "required");
 797     switch(elem_bt) {
 798       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 799       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 800       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 801       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 802       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 803     }
 804   }
 805 }
 806 
// Unsigned 64-bit lane min/max: dst = umin/umax(src1, src2).
// On AVX512 without VL, the EVEX unsigned instructions are used at full
// 512-bit width; otherwise there is no unsigned 64-bit compare, so both
// operands are biased by 2^63 and the signed compare builds a blend mask.
void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // For optimality, leverage a full vector width of 512 bits
  // for operations over smaller vector sizes on AVX512 targets.
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
    // T1 = -1
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63  (the 2^63 bias, i.e. the sign bit in every lane)
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
    // Mask = T2 > T1  (signed compare of biased values == unsigned compare of originals)
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}
 837 
 838 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 839                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
 840   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 841   if (opcode == Op_UMinV) {
 842     switch(elem_bt) {
 843       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 844       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 845       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 846       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 847       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 848     }
 849   } else {
 850     assert(opcode == Op_UMaxV, "required");
 851     switch(elem_bt) {
 852       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 853       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 854       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 855       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 856       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 857     }
 858   }
 859 }
 860 
 861 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 862                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 863                                  int vlen_enc) {
 864   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 865 
 866   if (opcode == Op_MinV) {
 867     if (elem_bt == T_BYTE) {
 868       vpminsb(dst, src1, src2, vlen_enc);
 869     } else if (elem_bt == T_SHORT) {
 870       vpminsw(dst, src1, src2, vlen_enc);
 871     } else if (elem_bt == T_INT) {
 872       vpminsd(dst, src1, src2, vlen_enc);
 873     } else {
 874       assert(elem_bt == T_LONG, "required");
 875       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 876         vpminsq(dst, src1, src2, vlen_enc);
 877       } else {
 878         assert_different_registers(dst, src1, src2);
 879         vpcmpgtq(dst, src1, src2, vlen_enc);
 880         vblendvpd(dst, src1, src2, dst, vlen_enc);
 881       }
 882     }
 883   } else { // opcode == Op_MaxV
 884     if (elem_bt == T_BYTE) {
 885       vpmaxsb(dst, src1, src2, vlen_enc);
 886     } else if (elem_bt == T_SHORT) {
 887       vpmaxsw(dst, src1, src2, vlen_enc);
 888     } else if (elem_bt == T_INT) {
 889       vpmaxsd(dst, src1, src2, vlen_enc);
 890     } else {
 891       assert(elem_bt == T_LONG, "required");
 892       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 893         vpmaxsq(dst, src1, src2, vlen_enc);
 894       } else {
 895         assert_different_registers(dst, src1, src2);
 896         vpcmpgtq(dst, src1, src2, vlen_enc);
 897         vblendvpd(dst, src2, src1, dst, vlen_enc);
 898       }
 899     }
 900   }
 901 }
 902 
 903 // Float/Double min max
 904 
// Float/double vector min/max (and min/max reductions) with Java semantics:
// unlike raw vminps/vmaxps, -0.0 is strictly less than +0.0 and a NaN in
// either input propagates to the result. See the algorithm note below.
void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   *  Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  // Select the single-/double-precision flavor of blend, min/max and compare
  // once, so the emission sequence at the bottom is shared by all four cases.
  // 'mask' is the operand whose sign bit drives the pre-blend (a for min, b for max).
  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
  if (precompute_mask && !is_double_word) {
    // Materialize an explicit all-ones/all-zeros mask from the sign bits.
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    // 64-bit lanes: (0 > mask) yields all-ones where the sign bit is set.
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  // Pre-blend on the sign mask, do the hardware min/max, then propagate a
  // NaN from atmp (detected via self-compare UNORD) over the min/max result.
  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}
 992 
// Float/double vector min/max with Java semantics (-0.0 < +0.0, NaN
// propagates), implemented with AVX512 opmask-based blends.
// In each branch:
//  1. evpmov[dq]2m extracts the per-lane sign bits of one input into ktmp.
//  2. Two masked blends order the operands so the hardware min/max's
//     "return second operand on equal" behavior gives the Java answer
//     for +/-0.0.
//  3. The UNORD self-compare plus masked move propagate a NaN found in
//     atmp, which vmin/vmax alone would not.
void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}
1039 
1040 void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1041                                    XMMRegister src1, XMMRegister src2, int vlen_enc) {
1042   assert(opc == Op_MinV || opc == Op_MinReductionV ||
1043          opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");
1044 
1045   int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
1046                                                          : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
1047   if (elem_bt == T_FLOAT) {
1048     evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
1049   } else {
1050     assert(elem_bt == T_DOUBLE, "");
1051     evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
1052   }
1053 }
1054 
// Float/Double signum: dst = 0.0 (or NaN) if dst is +/-0.0 or NaN,
// 1.0 if dst > 0, -1.0 if dst < 0. 'zero' and 'one' hold the constants
// +0.0 and 1.0 of the matching width.
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  // Handle special cases +0.0/-0.0 and NaN, if argument is +0.0/-0.0 or NaN, return argument
  // If AVX10.2 (or newer) floating point comparison instructions used, SF=1 for equal and unordered cases
  // If other floating point comparison instructions used, ZF=1 for equal and unordered cases
  if (opcode == Op_SignumF) {
    if (VM_Version::supports_avx10_2()) {
      vucomxss(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomiss(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    // Load 1.0f; the move does not touch EFLAGS, so the jcc below still
    // tests the comparison against zero performed above.
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    // Argument was negative: flip the sign bit to produce -1.0f.
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    if (VM_Version::supports_avx10_2()) {
      vucomxsd(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomisd(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    // Same pattern as the float case, with double-width constants.
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}
1090 
1091 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1092   if (sign) {
1093     pmovsxbw(dst, src);
1094   } else {
1095     pmovzxbw(dst, src);
1096   }
1097 }
1098 
1099 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1100   if (sign) {
1101     vpmovsxbw(dst, src, vector_len);
1102   } else {
1103     vpmovzxbw(dst, src, vector_len);
1104   }
1105 }
1106 
1107 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1108   if (sign) {
1109     vpmovsxbd(dst, src, vector_len);
1110   } else {
1111     vpmovzxbd(dst, src, vector_len);
1112   }
1113 }
1114 
1115 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1116   if (sign) {
1117     vpmovsxwd(dst, src, vector_len);
1118   } else {
1119     vpmovzxwd(dst, src, vector_len);
1120   }
1121 }
1122 
1123 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1124                                      int shift, int vector_len) {
1125   if (opcode == Op_RotateLeftV) {
1126     if (etype == T_INT) {
1127       evprold(dst, src, shift, vector_len);
1128     } else {
1129       assert(etype == T_LONG, "expected type T_LONG");
1130       evprolq(dst, src, shift, vector_len);
1131     }
1132   } else {
1133     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1134     if (etype == T_INT) {
1135       evprord(dst, src, shift, vector_len);
1136     } else {
1137       assert(etype == T_LONG, "expected type T_LONG");
1138       evprorq(dst, src, shift, vector_len);
1139     }
1140   }
1141 }
1142 
1143 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1144                                      XMMRegister shift, int vector_len) {
1145   if (opcode == Op_RotateLeftV) {
1146     if (etype == T_INT) {
1147       evprolvd(dst, src, shift, vector_len);
1148     } else {
1149       assert(etype == T_LONG, "expected type T_LONG");
1150       evprolvq(dst, src, shift, vector_len);
1151     }
1152   } else {
1153     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1154     if (etype == T_INT) {
1155       evprorvd(dst, src, shift, vector_len);
1156     } else {
1157       assert(etype == T_LONG, "expected type T_LONG");
1158       evprorvq(dst, src, shift, vector_len);
1159     }
1160   }
1161 }
1162 
1163 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1164   if (opcode == Op_RShiftVI) {
1165     psrad(dst, shift);
1166   } else if (opcode == Op_LShiftVI) {
1167     pslld(dst, shift);
1168   } else {
1169     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1170     psrld(dst, shift);
1171   }
1172 }
1173 
1174 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1175   switch (opcode) {
1176     case Op_RShiftVI:  psrad(dst, shift); break;
1177     case Op_LShiftVI:  pslld(dst, shift); break;
1178     case Op_URShiftVI: psrld(dst, shift); break;
1179 
1180     default: assert(false, "%s", NodeClassNames[opcode]);
1181   }
1182 }
1183 
1184 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1185   if (opcode == Op_RShiftVI) {
1186     vpsrad(dst, nds, shift, vector_len);
1187   } else if (opcode == Op_LShiftVI) {
1188     vpslld(dst, nds, shift, vector_len);
1189   } else {
1190     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1191     vpsrld(dst, nds, shift, vector_len);
1192   }
1193 }
1194 
1195 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1196   switch (opcode) {
1197     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1198     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1199     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1200 
1201     default: assert(false, "%s", NodeClassNames[opcode]);
1202   }
1203 }
1204 
1205 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1206   switch (opcode) {
1207     case Op_RShiftVB:  // fall-through
1208     case Op_RShiftVS:  psraw(dst, shift); break;
1209 
1210     case Op_LShiftVB:  // fall-through
1211     case Op_LShiftVS:  psllw(dst, shift);   break;
1212 
1213     case Op_URShiftVS: // fall-through
1214     case Op_URShiftVB: psrlw(dst, shift);  break;
1215 
1216     default: assert(false, "%s", NodeClassNames[opcode]);
1217   }
1218 }
1219 
1220 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1221   switch (opcode) {
1222     case Op_RShiftVB:  // fall-through
1223     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1224 
1225     case Op_LShiftVB:  // fall-through
1226     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1227 
1228     case Op_URShiftVS: // fall-through
1229     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1230 
1231     default: assert(false, "%s", NodeClassNames[opcode]);
1232   }
1233 }
1234 
// Register-count 64-bit element shift (SSE, two-operand form).
void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}
1244 
1245 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1246   if (opcode == Op_RShiftVL) {
1247     psrlq(dst, shift);  // using srl to implement sra on pre-avs512 systems
1248   } else if (opcode == Op_LShiftVL) {
1249     psllq(dst, shift);
1250   } else {
1251     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1252     psrlq(dst, shift);
1253   }
1254 }
1255 
1256 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1257   switch (opcode) {
1258     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1259     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1260     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1261 
1262     default: assert(false, "%s", NodeClassNames[opcode]);
1263   }
1264 }
1265 
1266 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1267   if (opcode == Op_RShiftVL) {
1268     evpsraq(dst, nds, shift, vector_len);
1269   } else if (opcode == Op_LShiftVL) {
1270     vpsllq(dst, nds, shift, vector_len);
1271   } else {
1272     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1273     vpsrlq(dst, nds, shift, vector_len);
1274   }
1275 }
1276 
1277 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1278   switch (opcode) {
1279     case Op_RShiftVB:  // fall-through
1280     case Op_RShiftVS:  // fall-through
1281     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1282 
1283     case Op_LShiftVB:  // fall-through
1284     case Op_LShiftVS:  // fall-through
1285     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1286 
1287     case Op_URShiftVB: // fall-through
1288     case Op_URShiftVS: // fall-through
1289     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1290 
1291     default: assert(false, "%s", NodeClassNames[opcode]);
1292   }
1293 }
1294 
1295 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1296   switch (opcode) {
1297     case Op_RShiftVB:  // fall-through
1298     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1299 
1300     case Op_LShiftVB:  // fall-through
1301     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1302 
1303     case Op_URShiftVB: // fall-through
1304     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1305 
1306     default: assert(false, "%s", NodeClassNames[opcode]);
1307   }
1308 }
1309 
// Per-lane variable 64-bit shift. Left and unsigned-right map directly to
// AVX2 instructions; arithmetic-right needs AVX512 (evpsravq) or an AVX2
// emulation using a shifted sign-bit mask.
void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          // Without AVX512VL the EVEX form is only legal at 512-bit width.
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        // AVX2 has no 64-bit arithmetic right shift. Use the identity
        // x >>a s == ((x >>l s) ^ t) - t, with t = sign_mask >>l s, to
        // re-extend the sign bits after a logical shift.
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}
1342 
// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  // Widen bytes to dwords (sign-extended only for arithmetic right shift),
  // shift per lane at dword width, then mask back to byte range and re-pack.
  vextendbd(sign, dst, src, 1);
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
  // Narrow the two 128-bit halves back into packed words.
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}
1357 
// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  // Operate at twice the input width: bytes are widened to words first.
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  // Clear the high byte of each word so the saturating pack below cannot clip.
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    // vpackuswb interleaves 128-bit lanes; permute to restore element order.
    vpermq(dst, dst, 0xD8, vector_len);
  }
}
1378 
1379 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1380   switch(typ) {
1381     case T_BYTE:
1382       pinsrb(dst, val, idx);
1383       break;
1384     case T_SHORT:
1385       pinsrw(dst, val, idx);
1386       break;
1387     case T_INT:
1388       pinsrd(dst, val, idx);
1389       break;
1390     case T_LONG:
1391       pinsrq(dst, val, idx);
1392       break;
1393     default:
1394       assert(false,"Should not reach here.");
1395       break;
1396   }
1397 }
1398 
1399 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1400   switch(typ) {
1401     case T_BYTE:
1402       vpinsrb(dst, src, val, idx);
1403       break;
1404     case T_SHORT:
1405       vpinsrw(dst, src, val, idx);
1406       break;
1407     case T_INT:
1408       vpinsrd(dst, src, val, idx);
1409       break;
1410     case T_LONG:
1411       vpinsrq(dst, src, val, idx);
1412       break;
1413     default:
1414       assert(false,"Should not reach here.");
1415       break;
1416   }
1417 }
1418 
// Masked scalar gather of one 64-bit slice (4 shorts or 8 bytes):
//   dst[i] = mask bit (mask_idx + i) set ? base[idx_base[i]] : 0
// 'mask' is a GPR bitmask probed with btq; 'mask_idx' is advanced by one
// per lane, so consecutive calls consume consecutive mask bits.
void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
                                         Register base, Register idx_base,
                                         Register mask, Register mask_idx,
                                         Register rtmp, int vlen_enc) {
  vpxor(dst, dst, dst, vlen_enc);   // zero so that skipped lanes stay 0
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));   // gather indices are 32-bit
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
      bind(skip_load);
      incq(mask_idx);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      pinsrb(dst, Address(base, rtmp), i);
      bind(skip_load);
      incq(mask_idx);
    }
  }
}
1449 
1450 void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
1451                                   Register base, Register idx_base,
1452                                   Register rtmp, int vlen_enc) {
1453   vpxor(dst, dst, dst, vlen_enc);
1454   if (elem_bt == T_SHORT) {
1455     for (int i = 0; i < 4; i++) {
1456       // dst[i] = src[idx_base[i]]
1457       movl(rtmp, Address(idx_base, i * 4));
1458       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1459     }
1460   } else {
1461     assert(elem_bt == T_BYTE, "");
1462     for (int i = 0; i < 8; i++) {
1463       // dst[i] = src[idx_base[i]]
1464       movl(rtmp, Address(idx_base, i * 4));
1465       pinsrb(dst, Address(base, rtmp), i);
1466     }
1467   }
1468 }
1469 
1470 /*
1471  * Gather using hybrid algorithm, first partially unroll scalar loop
1472  * to accumulate values from gather indices into a quad-word(64bit) slice.
1473  * A slice may hold 8 bytes or 4 short values. This is followed by a vector
1474  * permutation to place the slice into appropriate vector lane
1475  * locations in destination vector. Following pseudo code describes the
1476  * algorithm in detail:
1477  *
1478  * DST_VEC = ZERO_VEC
1479  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1480  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1481  * FOREACH_ITER:
1482  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1483  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1484  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1485  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1486  *
 * With each iteration, the doubleword permute indices (0,1) corresponding
 * to the gathered quadword get right-shifted by two lane positions.
1489  *
1490  */
void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
                                        Register base, Register idx_base,
                                        Register mask, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister temp_dst,
                                        Register rtmp, Register mask_idx,
                                        Register length, int vector_len, int vlen_enc) {
  Label GATHER8_LOOP;
  assert(is_subword_type(elem_ty), "");
  movl(length, vector_len);   // remaining element count, loop counter
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
  vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
  // Build the constant {2,2,...} as (0 - (-1)) << 1, avoiding a memory load.
  vallones(xtmp2, vlen_enc);
  vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
  vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
  load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}

  bind(GATHER8_LOOP);
    // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
    if (mask == noreg) {
      vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
    } else {
      vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
    }
    // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
    vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
    // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
    vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
    // DST_VEC = DST_VEC OR TEMP_PERM_VEC
    vpor(dst, dst, temp_dst, vlen_enc);
    // Advance past the 8 (bytes) or 4 (shorts) 32-bit indices just consumed.
    addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
    // 8 byte-lanes or 4 short-lanes were filled this iteration.
    subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
    jcc(Assembler::notEqual, GATHER8_LOOP);
}
1524 
1525 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1526   switch(typ) {
1527     case T_INT:
1528       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1529       break;
1530     case T_FLOAT:
1531       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1532       break;
1533     case T_LONG:
1534       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1535       break;
1536     case T_DOUBLE:
1537       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1538       break;
1539     default:
1540       assert(false,"Should not reach here.");
1541       break;
1542   }
1543 }
1544 
1545 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1546   switch(typ) {
1547     case T_INT:
1548       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1549       break;
1550     case T_FLOAT:
1551       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1552       break;
1553     case T_LONG:
1554       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1555       break;
1556     case T_DOUBLE:
1557       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1558       break;
1559     default:
1560       assert(false,"Should not reach here.");
1561       break;
1562   }
1563 }
1564 
1565 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1566   switch(typ) {
1567     case T_INT:
1568       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1569       break;
1570     case T_FLOAT:
1571       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1572       break;
1573     case T_LONG:
1574       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1575       break;
1576     case T_DOUBLE:
1577       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1578       break;
1579     default:
1580       assert(false,"Should not reach here.");
1581       break;
1582   }
1583 }
1584 
// Expand a per-byte boolean vector in 'src' (assumes each byte is 0 or 1 —
// TODO confirm at callers) into a full vector mask in 'dst': true lanes become
// all-ones, false lanes all-zeros, sign-extended to the width of 'elem_bt'.
void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
  if (vlen_in_bytes <= 16) {
    // dst = 0 - src: byte value 1 becomes 0xFF, 0 stays 0.
    pxor (dst, dst);
    psubb(dst, src);
    // Sign-extend the per-byte mask to the element width.
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  pmovsxbw(dst, dst); break;
      case T_INT:    pmovsxbd(dst, dst); break;
      case T_FLOAT:  pmovsxbd(dst, dst); break;
      case T_LONG:   pmovsxbq(dst, dst); break;
      case T_DOUBLE: pmovsxbq(dst, dst); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  } else {
    assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
    int vlen_enc = vector_length_encoding(vlen_in_bytes);

    vpxor (dst, dst, dst, vlen_enc);
    // Legacy (VEX-only) encoding of vpsubb is capped at 256 bits.
    vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);

    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */            break;
      case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
      case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
      case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
      case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
      case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  }
}
1618 
// Build an opmask register 'dst' from a per-byte boolean vector in 'src'.
// 'novlbwdq' presumably means the target lacks the AVX512 VL/BW/DQ extensions
// needed for evpmovb2m — TODO confirm against callers.
void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister xtmp, XMMRegister src, bool novlbwdq, int vlen_enc) {
  if (novlbwdq) {
    // Widen bytes to dwords and compare against a stub-provided bit pattern
    // to derive the mask without byte-granular opmask support.
    vpmovsxbd(xtmp, src, vlen_enc);
    evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
            Assembler::eq, true, vlen_enc, noreg);
  } else {
    // Negate the 0/1 bytes so the sign bit encodes truth, then move the
    // per-byte sign bits into the k-register.
    vpxor(xtmp, xtmp, xtmp, vlen_enc);
    vpsubb(xtmp, xtmp, src, vlen_enc);
    evpmovb2m(dst, xtmp, vlen_enc);
  }
}
1630 
1631 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1632   if (is_integral_type(bt)) {
1633     switch (vlen_in_bytes) {
1634       case 4:  movdl(dst, src);   break;
1635       case 8:  movq(dst, src);    break;
1636       case 16: movdqu(dst, src);  break;
1637       case 32: vmovdqu(dst, src); break;
1638       case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1639       default: ShouldNotReachHere();
1640     }
1641   } else {
1642     switch (vlen_in_bytes) {
1643       case 4:  movflt(dst, src); break;
1644       case 8:  movdbl(dst, src); break;
1645       case 16: movups(dst, src); break;
1646       case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1647       case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1648       default: ShouldNotReachHere();
1649     }
1650   }
1651 }
1652 
1653 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1654   assert(rscratch != noreg || always_reachable(src), "missing");
1655 
1656   if (reachable(src)) {
1657     load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1658   } else {
1659     lea(rscratch, src);
1660     load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1661   }
1662 }
1663 
// Splat a scalar constant at 'src' across all lanes of 'dst', choosing the
// cheapest broadcast form the CPU supports; falls back to a plain full-width
// vector load when no broadcast is available.
void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
  int vlen_enc = vector_length_encoding(vlen);
  if (VM_Version::supports_avx()) {
    if (bt == T_LONG) {
      // 64-bit integer broadcast needs AVX2; otherwise duplicate via movddup.
      if (VM_Version::supports_avx2()) {
        vpbroadcastq(dst, src, vlen_enc);
      } else {
        vmovddup(dst, src, vlen_enc);
      }
    } else if (bt == T_DOUBLE) {
      // vbroadcastsd has no 128-bit form; use movddup for XMM width.
      if (vlen_enc != Assembler::AVX_128bit) {
        vbroadcastsd(dst, src, vlen_enc, noreg);
      } else {
        vmovddup(dst, src, vlen_enc);
      }
    } else {
      // 32-bit element: integer broadcast on AVX2, else the FP broadcast
      // (bit pattern is identical either way).
      if (VM_Version::supports_avx2() && is_integral_type(bt)) {
        vpbroadcastd(dst, src, vlen_enc);
      } else {
        vbroadcastss(dst, src, vlen_enc);
      }
    }
  } else if (VM_Version::supports_sse3()) {
    movddup(dst, src);
  } else {
    load_vector(bt, dst, src, vlen);
  }
}
1692 
// Load the first 'vlen_in_bytes' bytes of the iota table {0, 1, 2, ...} for
// element type 'bt' from the shared stub area.
void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
  // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
  int offset = exact_log2(type2aelembytes(bt)) << 6;
  // F/D tables sit after the four integral tables (2 * 64 bytes further on).
  if (is_floating_point_type(bt)) {
    offset += 128;
  }
  ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
  // Raw byte copy: the table is already laid out in the element format.
  load_vector(T_BYTE, dst, addr, vlen_in_bytes);
}
1702 
1703 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1704 
// Emit one combining step of an ordered reduction at 128-bit (or scalar FP)
// width: dst = dst <op> src, where the operation is selected by the C2
// reduction opcode and, for polymorphic opcodes, the element type 'typ'.
void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
  int vector_len = Assembler::AVX_128bit;

  switch (opcode) {
    case Op_AndReductionV:  pand(dst, src); break;
    case Op_OrReductionV:   por (dst, src); break;
    case Op_XorReductionV:  pxor(dst, src); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:        pminsb(dst, src); break;
        case T_SHORT:       pminsw(dst, src); break;
        case T_INT:         pminsd(dst, src); break;
        // 64-bit min/max only exist as EVEX instructions.
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:        pmaxsb(dst, src); break;
        case T_SHORT:       pmaxsw(dst, src); break;
        case T_INT:         pmaxsd(dst, src); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    // FP add/mul use the scalar forms: only lane 0 matters at this point.
    case Op_AddReductionVF: addss(dst, src); break;
    case Op_AddReductionVD: addsd(dst, src); break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:        paddb(dst, src); break;
        case T_SHORT:       paddw(dst, src); break;
        case T_INT:         paddd(dst, src); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: paddq(dst, src); break;
    case Op_MulReductionVF: mulss(dst, src); break;
    case Op_MulReductionVD: mulsd(dst, src); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT:       pmullw(dst, src); break;
        case T_INT:         pmulld(dst, src); break;
        default:            assert(false, "wrong type");
      }
      break;
    // 64-bit multiply only exists as an EVEX instruction.
    case Op_MulReductionVL: assert(UseAVX > 2, "required");
                            evpmullq(dst, dst, src, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}
1757 
1758 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1759   switch (opcode) {
1760     case Op_AddReductionVF: addps(dst, src); break;
1761     case Op_AddReductionVD: addpd(dst, src); break;
1762     case Op_MulReductionVF: mulps(dst, src); break;
1763     case Op_MulReductionVD: mulpd(dst, src); break;
1764     default:                assert(false, "%s", NodeClassNames[opcode]);
1765   }
1766 }
1767 
// Emit one combining step of an ordered reduction at 256-bit width:
// dst = src1 <op> src2, selected by the C2 reduction opcode and element type.
// No FP add/mul cases here: those are folded at 128-bit/scalar width.
void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
    case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
    case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
        case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
        // 64-bit min/max only exist as EVEX instructions.
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpminsq(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpmaxsq(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
        case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}
1815 
1816 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1817   int vector_len = Assembler::AVX_256bit;
1818 
1819   switch (opcode) {
1820     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1821     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1822     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1823     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1824     default:                assert(false, "%s", NodeClassNames[opcode]);
1825   }
1826 }
1827 
1828 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1829                                   XMMRegister dst, XMMRegister src,
1830                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1831   switch (opcode) {
1832     case Op_AddReductionVF:
1833     case Op_MulReductionVF:
1834       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1835       break;
1836 
1837     case Op_AddReductionVD:
1838     case Op_MulReductionVD:
1839       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1840       break;
1841 
1842     default: assert(false, "wrong opcode");
1843   }
1844 }
1845 
1846 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1847                                             XMMRegister dst, XMMRegister src,
1848                                             XMMRegister vtmp1, XMMRegister vtmp2) {
1849   switch (opcode) {
1850     case Op_AddReductionVF:
1851     case Op_MulReductionVF:
1852       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1853       break;
1854 
1855     case Op_AddReductionVD:
1856     case Op_MulReductionVD:
1857       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1858       break;
1859 
1860     default: assert(false, "%s", NodeClassNames[opcode]);
1861   }
1862 }
1863 
1864 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1865                              Register dst, Register src1, XMMRegister src2,
1866                              XMMRegister vtmp1, XMMRegister vtmp2) {
1867   switch (vlen) {
1868     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1869     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1870     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1871     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1872 
1873     default: assert(false, "wrong vector length");
1874   }
1875 }
1876 
1877 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1878                              Register dst, Register src1, XMMRegister src2,
1879                              XMMRegister vtmp1, XMMRegister vtmp2) {
1880   switch (vlen) {
1881     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1882     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1883     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1884     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1885 
1886     default: assert(false, "wrong vector length");
1887   }
1888 }
1889 
1890 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1891                              Register dst, Register src1, XMMRegister src2,
1892                              XMMRegister vtmp1, XMMRegister vtmp2) {
1893   switch (vlen) {
1894     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1895     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1896     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1897     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1898 
1899     default: assert(false, "wrong vector length");
1900   }
1901 }
1902 
1903 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1904                              Register dst, Register src1, XMMRegister src2,
1905                              XMMRegister vtmp1, XMMRegister vtmp2) {
1906   switch (vlen) {
1907     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1908     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1909     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1910     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1911 
1912     default: assert(false, "wrong vector length");
1913   }
1914 }
1915 
1916 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1917                              Register dst, Register src1, XMMRegister src2,
1918                              XMMRegister vtmp1, XMMRegister vtmp2) {
1919   switch (vlen) {
1920     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1921     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1922     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1923 
1924     default: assert(false, "wrong vector length");
1925   }
1926 }
1927 
1928 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1929   switch (vlen) {
1930     case 2:
1931       assert(vtmp2 == xnoreg, "");
1932       reduce2F(opcode, dst, src, vtmp1);
1933       break;
1934     case 4:
1935       assert(vtmp2 == xnoreg, "");
1936       reduce4F(opcode, dst, src, vtmp1);
1937       break;
1938     case 8:
1939       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1940       break;
1941     case 16:
1942       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1943       break;
1944     default: assert(false, "wrong vector length");
1945   }
1946 }
1947 
1948 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1949   switch (vlen) {
1950     case 2:
1951       assert(vtmp2 == xnoreg, "");
1952       reduce2D(opcode, dst, src, vtmp1);
1953       break;
1954     case 4:
1955       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1956       break;
1957     case 8:
1958       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1959       break;
1960     default: assert(false, "wrong vector length");
1961   }
1962 }
1963 
1964 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1965   switch (vlen) {
1966     case 2:
1967       assert(vtmp1 == xnoreg, "");
1968       assert(vtmp2 == xnoreg, "");
1969       unorderedReduce2F(opcode, dst, src);
1970       break;
1971     case 4:
1972       assert(vtmp2 == xnoreg, "");
1973       unorderedReduce4F(opcode, dst, src, vtmp1);
1974       break;
1975     case 8:
1976       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
1977       break;
1978     case 16:
1979       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
1980       break;
1981     default: assert(false, "wrong vector length");
1982   }
1983 }
1984 
1985 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1986   switch (vlen) {
1987     case 2:
1988       assert(vtmp1 == xnoreg, "");
1989       assert(vtmp2 == xnoreg, "");
1990       unorderedReduce2D(opcode, dst, src);
1991       break;
1992     case 4:
1993       assert(vtmp2 == xnoreg, "");
1994       unorderedReduce4D(opcode, dst, src, vtmp1);
1995       break;
1996     case 8:
1997       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
1998       break;
1999     default: assert(false, "wrong vector length");
2000   }
2001 }
2002 
// Reduce 2 ints (low two dwords of src2) combined with scalar src1 into dst.
void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    // Horizontal add folds dwords 0 and 1 into dword 0.
    phaddd(vtmp1, vtmp1);
  } else {
    // Bring dword 1 down and combine with dword 0.
    pshufd(vtmp1, src2, 0x1);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
  }
  // Fold in the scalar input and move the result to the GPR.
  movdl(vtmp2, src1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  movdl(dst, vtmp1);
}
2017 
// Reduce 4 ints in src2 combined with scalar src1 into dst, by halving down
// to the 2-element case.
void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    // Pairwise add leaves two partial sums in the low qword.
    phaddd(vtmp1, src2);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    // Fold the high qword (dwords 2,3) onto the low qword.
    pshufd(vtmp2, src2, 0xE);
    reduce_operation_128(T_INT, opcode, vtmp2, src2);
    reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
  }
}
2031 
// Reduce 8 ints (256-bit src2) combined with scalar src1 into dst.
void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    // Horizontal add within 128-bit halves, then fold high half onto low.
    vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
    vextracti128_high(vtmp2, vtmp1);
    vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    // Fold high 128-bit lane onto the low lane, then reduce 4 ints.
    vextracti128_high(vtmp1, src2);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
    reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  }
}
2044 
// Reduce 16 ints (512-bit src2) combined with scalar src1 into dst:
// fold the high 256 bits onto the low 256, then reduce 8 ints.
void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
  reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2050 
// Reduce 8 bytes (low qword of src2) combined with scalar src1 into dst,
// by repeated halving: 8 -> 4 -> 2 -> 1 bytes.
void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  // Fold bytes 4-7 (dword 1) onto bytes 0-3.
  pshufd(vtmp2, src2, 0x1);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  // Fold bytes 2-3 onto bytes 0-1.
  movdqu(vtmp1, vtmp2);
  psrldq(vtmp1, 2);
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  // Fold byte 1 onto byte 0.
  movdqu(vtmp2, vtmp1);
  psrldq(vtmp2, 1);
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  // Combine with the scalar input at int width, then sign-extend the byte result.
  movdl(vtmp2, src1);
  pmovsxbd(vtmp1, vtmp1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrb(dst, vtmp1, 0x0);
  movsbl(dst, dst);
}
2066 
// Reduce 16 bytes in src2 combined with scalar src1 into dst:
// fold the high qword (dwords 2,3) onto the low qword, then reduce 8 bytes.
void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp1, src2, 0xE);
  reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
  reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2072 
// Reduce 32 bytes (256-bit src2) combined with scalar src1 into dst:
// fold the high 128-bit lane onto the low lane, then reduce 16 bytes.
void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp2, src2);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2078 
// Reduce 64 bytes (512-bit src2) combined with scalar src1 into dst:
// fold the high 256 bits onto the low 256, then reduce 32 bytes.
void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp1, src2);
  reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
  reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2084 
// Multiply-reduce 8 bytes combined with scalar src1 into dst. There is no
// packed byte multiply, so widen to shorts and reuse the short reduction.
void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pmovsxbw(vtmp2, src2);
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2089 
2090 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2091   if (UseAVX > 1) {
2092     int vector_len = Assembler::AVX_256bit;
2093     vpmovsxbw(vtmp1, src2, vector_len);
2094     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2095   } else {
2096     pmovsxbw(vtmp2, src2);
2097     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2098     pshufd(vtmp2, src2, 0x1);
2099     pmovsxbw(vtmp2, src2);
2100     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2101   }
2102 }
2103 
// Multiply-reduce 32 bytes (256-bit src2) combined with scalar src1 into dst.
void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
    // Widen all 32 bytes to shorts in one 512-bit register.
    int vector_len = Assembler::AVX_512bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    assert(UseAVX >= 2,"Should not reach here.");
    // Reduce the low 16 bytes, then fold the high 128-bit lane into dst.
    mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
    vextracti128_high(vtmp2, src2);
    mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
  }
}
2116 
// Multiply-reduce 64 bytes (512-bit src2) combined with scalar src1 into dst:
// reduce the low 256 bits, then fold the high 256 bits into dst.
void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
  vextracti64x4_high(vtmp2, src2);
  mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
}
2122 
// Reduce 4 shorts (low qword of src2) combined with scalar src1 into dst.
void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    // Two horizontal adds collapse 4 words into word 0.
    phaddw(vtmp1, vtmp1);
    phaddw(vtmp1, vtmp1);
  } else {
    // Fold words 2-3 (dword 1) onto words 0-1, then word 1 onto word 0.
    pshufd(vtmp2, src2, 0x1);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
    movdqu(vtmp1, vtmp2);
    psrldq(vtmp1, 2);
    reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
  }
  // Combine with the scalar input at int width, then sign-extend the short result.
  movdl(vtmp2, src1);
  pmovsxwd(vtmp1, vtmp1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrw(dst, vtmp1, 0x0);
  movswl(dst, dst);
}
2143 
// Reduce 8 shorts in src2 combined with scalar src1 into dst, by halving
// down to the 4-element case.
void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    // Pairwise add leaves four partial sums in the low qword.
    phaddw(vtmp1, src2);
  } else {
    // Fold the high qword (words 4-7) onto the low qword.
    pshufd(vtmp1, src2, 0xE);
    reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
  }
  reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2156 
// Reduce 16 shorts (256-bit src2) combined with scalar src1 into dst.
void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    int vector_len = Assembler::AVX_256bit;
    // Horizontal add works within 128-bit lanes; the vpermq re-interleaves
    // the qwords so the 8 partial sums land in the low 128 bits.
    vphaddw(vtmp2, src2, src2, vector_len);
    vpermq(vtmp2, vtmp2, 0xD8, vector_len);
  } else {
    // Fold the high 128-bit lane onto the low lane.
    vextracti128_high(vtmp2, src2);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
  }
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2168 
2169 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2170   int vector_len = Assembler::AVX_256bit;
2171   vextracti64x4_high(vtmp1, src2);
2172   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2173   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2174 }
2175 
// Reduce 2 longs in src2 combined with scalar src1 into dst.
void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  // Fold the high qword onto the low qword.
  pshufd(vtmp2, src2, 0xE);
  reduce_operation_128(T_LONG, opcode, vtmp2, src2);
  // Fold in the scalar input and move the 64-bit result to the GPR.
  movdq(vtmp1, src1);
  reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
  movdq(dst, vtmp1);
}
2183 
// Reduce 4 longs (256-bit src2) combined with scalar src1 into dst:
// fold the high 128-bit lane onto the low lane, then reduce 2 longs.
void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp1, src2);
  reduce_operation_128(T_LONG, opcode, vtmp1, src2);
  reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2189 
// Reduce 8 longs (512-bit src2) combined with scalar src1 into dst:
// fold the high 256 bits onto the low 256, then reduce 4 longs.
void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
  reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2195 
// Materialize an opmask with the low 'len' bits set: start from all-ones,
// then bzhiq clears every bit at position >= len.
void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
  mov64(temp, -1L);
  bzhiq(temp, temp, len);
  kmovql(dst, temp);
}
2201 
// Ordered reduction of the low 2 floats of src into dst (dst already holds
// the accumulated value; combines are scalar, so only lane 0 matters).
void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_FLOAT, opcode, dst, src);
  // Bring lane 1 down to lane 0 and fold it in.
  pshufd(vtmp, src, 0x1);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
2207 
// Ordered reduction of 4 floats of src into dst: fold lanes 0,1 via
// reduce2F, then lanes 2 and 3 one at a time (order preserved).
void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce2F(opcode, dst, src, vtmp);
  pshufd(vtmp, src, 0x2);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
  pshufd(vtmp, src, 0x3);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
2215 
// Ordered reduction of 8 floats (256-bit src) into dst: low 128-bit lane
// first, then the extracted high lane (order preserved).
void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4F(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce4F(opcode, dst, vtmp2, vtmp1);
}
2221 
// Ordered reduction of 16 floats (512-bit src) into dst: low 256 bits first,
// then the extracted high 256 bits (order preserved).
void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce8F(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
}
2227 
// Unordered reduction of the low 2 floats of src into lane 0 of dst.
// The scalar combine from reduce_operation_128 suffices: only lane 0 is used.
void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0x1);
  reduce_operation_128(T_FLOAT, opcode, dst, src);
}
2232 
// Unordered reduction of 4 floats of src into dst: combine the two lane
// pairs in parallel, then finish with the 2-element step.
void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  pshufd(vtmp, src, 0xE);
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
  unorderedReduce2F(opcode, dst, vtmp);
}
2238 
// Unordered reduction of 8 floats (256-bit src) into dst: fold the high
// 128-bit lane onto the low lane in parallel, then reduce 4 floats.
void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf128_high(vtmp1, src);
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
  unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
}
2244 
// Unordered reduction of 16 floats (512-bit src) into dst: fold the high
// 256 bits onto the low 256 in parallel, then reduce 8 floats.
void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);
  unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
  unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
}
2250 
// Ordered reduction of 2 doubles of src into dst (combines are scalar; only
// lane 0 matters).
void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
  // Bring the high double down to lane 0 and fold it in.
  pshufd(vtmp, src, 0xE);
  reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
}
2256 
// Ordered reduction of 4 doubles (256-bit src) into dst: low 128-bit lane
// first, then the extracted high lane (order preserved).
void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce2D(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce2D(opcode, dst, vtmp2, vtmp1);
}
2262 
// Ordered reduction of 8 doubles (512-bit src) into dst: low 256 bits first,
// then the extracted high 256 bits (order preserved).
void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4D(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
}
2268 
// Unordered reduction of 2 doubles of src into lane 0 of dst. The scalar
// combine from reduce_operation_128 suffices: only lane 0 is used afterwards.
void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0xE);
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
}
2273 
// Unordered reduction of 4 doubles (256-bit src) into dst: fold the high
// 128-bit lane onto the low lane in parallel, then reduce 2 doubles.
void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  vextractf128_high(vtmp, src);
  unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
  unorderedReduce2D(opcode, dst, vtmp);
}
2279 
// Unordered reduction of 8 doubles (512-bit src) into dst: fold the high
// 256 bits onto the low 256 in parallel, then reduce 4 doubles.
void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);
  unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
  unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
}
2285 
// Masked vector load (memory -> register): thin forwarder to the shared
// MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2289 
// Masked vector store (register -> memory): thin forwarder to the shared
// MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2293 
// Masked register-to-register vector move: thin forwarder to the shared
// MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2297 
2298 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2299                                  int vec_enc) {
2300   switch(elem_bt) {
2301     case T_INT:
2302     case T_FLOAT:
2303       vmaskmovps(dst, src, mask, vec_enc);
2304       break;
2305     case T_LONG:
2306     case T_DOUBLE:
2307       vmaskmovpd(dst, src, mask, vec_enc);
2308       break;
2309     default:
2310       fatal("Unsupported type %s", type2name(elem_bt));
2311       break;
2312   }
2313 }
2314 
2315 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2316                                  int vec_enc) {
2317   switch(elem_bt) {
2318     case T_INT:
2319     case T_FLOAT:
2320       vmaskmovps(dst, src, mask, vec_enc);
2321       break;
2322     case T_LONG:
2323     case T_DOUBLE:
2324       vmaskmovpd(dst, src, mask, vec_enc);
2325       break;
2326     default:
2327       fatal("Unsupported type %s", type2name(elem_bt));
2328       break;
2329   }
2330 }
2331 
// Min/max reduction of a float vector of 'vlen' elements into dst.
// The loop halves the active element count log2(vlen) times: i == 3 folds
// 512->256 bits (vextracti64x4), i == 2 folds 256->128 (vextracti128), and
// i == 1/0 fold within a 128-bit lane using vpermilps with the shuffle
// constants in permconst. When is_dst_valid, the incoming value of dst is
// combined in as a final step. On AVX10.2 the direct vminmax_fp form (mask
// k0) is emitted; otherwise the multi-scratch emulation sequence is used.
// tmp/atmp/btmp are scratch for the emulation path; xmm_0/xmm_1 hold the
// working destination and the folded half.
void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
                                          XMMRegister dst, XMMRegister src,
                                          XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                          XMMRegister xmm_0, XMMRegister xmm_1) {
  const int permconst[] = {1, 14};
  XMMRegister wsrc = src;   // working source of the current fold step
  XMMRegister wdst = xmm_0; // working destination
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; // receives the folded half

  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 16) {
    vlen_enc = Assembler::AVX_256bit;
  }

  for (int i = log2(vlen) - 1; i >=0; i--) {
    if (i == 0 && !is_dst_valid) {
      // Last step and no pre-existing dst value to merge: write dst directly.
      wdst = dst;
    }
    if (i == 3) {
      vextracti64x4_high(wtmp, wsrc);
    } else if (i == 2) {
      vextracti128_high(wtmp, wsrc);
    } else { // i = [0,1]
      vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
    }

    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }
    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit; // after the first fold everything fits in 128 bits
  }
  if (is_dst_valid) {
    // Fold the reduced value into the caller-provided accumulator in dst.
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2374 
// Min/max reduction of a double vector of 'vlen' elements into dst.
// Mirrors reduceFloatMinMax: i == 2 folds 512->256 bits, i == 1 folds
// 256->128, i == 0 folds the last two doubles within a lane via vpermilpd.
// When is_dst_valid, the incoming value of dst is combined in as a final
// step. AVX10.2 uses the direct vminmax_fp form (mask k0); otherwise the
// multi-scratch emulation sequence is emitted.
void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
                                        XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                        XMMRegister xmm_0, XMMRegister xmm_1) {
  XMMRegister wsrc = src;   // working source of the current fold step
  XMMRegister wdst = xmm_0; // working destination
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; // receives the folded half
  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 8) {
    vlen_enc = Assembler::AVX_256bit;
  }
  for (int i = log2(vlen) - 1; i >=0; i--) {
    if (i == 0 && !is_dst_valid) {
      // Last step and no pre-existing dst value to merge: write dst directly.
      wdst = dst;
    }
    if (i == 1) {
      vextracti128_high(wtmp, wsrc);
    } else if (i == 2) {
      vextracti64x4_high(wtmp, wsrc);
    } else {
      assert(i == 0, "%d", i);
      vpermilpd(wtmp, wsrc, 1, vlen_enc); // swap the two doubles within the lane
    }

    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }

    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit; // after the first fold everything fits in 128 bits
  }

  if (is_dst_valid) {
    // Fold the reduced value into the caller-provided accumulator in dst.
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2416 
2417 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2418   switch (bt) {
2419     case T_BYTE:  pextrb(dst, src, idx); break;
2420     case T_SHORT: pextrw(dst, src, idx); break;
2421     case T_INT:   pextrd(dst, src, idx); break;
2422     case T_LONG:  pextrq(dst, src, idx); break;
2423 
2424     default:
2425       assert(false,"Should not reach here.");
2426       break;
2427   }
2428 }
2429 
2430 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2431   int esize =  type2aelembytes(typ);
2432   int elem_per_lane = 16/esize;
2433   int lane = elemindex / elem_per_lane;
2434   int eindex = elemindex % elem_per_lane;
2435 
2436   if (lane >= 2) {
2437     assert(UseAVX > 2, "required");
2438     vextractf32x4(dst, src, lane & 3);
2439     return dst;
2440   } else if (lane > 0) {
2441     assert(UseAVX > 0, "required");
2442     vextractf128(dst, src, lane);
2443     return dst;
2444   } else {
2445     return src;
2446   }
2447 }
2448 
2449 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2450   if (typ == T_BYTE) {
2451     movsbl(dst, dst);
2452   } else if (typ == T_SHORT) {
2453     movswl(dst, dst);
2454   }
2455 }
2456 
2457 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2458   int esize =  type2aelembytes(typ);
2459   int elem_per_lane = 16/esize;
2460   int eindex = elemindex % elem_per_lane;
2461   assert(is_integral_type(typ),"required");
2462 
2463   if (eindex == 0) {
2464     if (typ == T_LONG) {
2465       movq(dst, src);
2466     } else {
2467       movdl(dst, src);
2468       movsxl(typ, dst);
2469     }
2470   } else {
2471     extract(typ, dst, src, eindex);
2472     movsxl(typ, dst);
2473   }
2474 }
2475 
// Move the floating-point element at 'elemindex' (within a 128-bit lane) of
// src into the low part of dst, zeroing the bits above the element. Uses a
// shuffle (float) or byte shift (double) to bring the element to position 0.
// vtmp is only required on the SSE (UseAVX == 0) float path, where the mask
// constant cannot be used as a direct memory operand of pand.
void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
  int esize =  type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;
  assert((typ == T_FLOAT || typ == T_DOUBLE),"required");

  if (eindex == 0) {
    // Element 0: movq copies the low 64 bits and zeroes the rest.
    movq(dst, src);
  } else {
    if (typ == T_FLOAT) {
      if (UseAVX == 0) {
        movdqu(dst, src);
        shufps(dst, dst, eindex); // broadcast-style shuffle brings element to slot 0
      } else {
        vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
      }
    } else {
      if (UseAVX == 0) {
        movdqu(dst, src);
        psrldq(dst, eindex*esize); // shift the wanted double down to the low 64 bits
      } else {
        vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
      }
      movq(dst, dst); // clear bits 127..64
    }
  }
  // Zero upper bits
  if (typ == T_FLOAT) {
    if (UseAVX == 0) {
      assert(vtmp != xnoreg, "required.");
      movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
      pand(dst, vtmp);
    } else {
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
    }
  }
}
2513 
2514 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2515   switch(typ) {
2516     case T_BYTE:
2517     case T_BOOLEAN:
2518       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2519       break;
2520     case T_SHORT:
2521     case T_CHAR:
2522       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2523       break;
2524     case T_INT:
2525     case T_FLOAT:
2526       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2527       break;
2528     case T_LONG:
2529     case T_DOUBLE:
2530       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2531       break;
2532     default:
2533       assert(false,"Should not reach here.");
2534       break;
2535   }
2536 }
2537 
2538 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2539   assert(rscratch != noreg || always_reachable(src2), "missing");
2540 
2541   switch(typ) {
2542     case T_BOOLEAN:
2543     case T_BYTE:
2544       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2545       break;
2546     case T_CHAR:
2547     case T_SHORT:
2548       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2549       break;
2550     case T_INT:
2551     case T_FLOAT:
2552       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2553       break;
2554     case T_LONG:
2555     case T_DOUBLE:
2556       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2557       break;
2558     default:
2559       assert(false,"Should not reach here.");
2560       break;
2561   }
2562 }
2563 
2564 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2565   switch(typ) {
2566     case T_BYTE:
2567       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2568       break;
2569     case T_SHORT:
2570       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2571       break;
2572     case T_INT:
2573     case T_FLOAT:
2574       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2575       break;
2576     case T_LONG:
2577     case T_DOUBLE:
2578       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2579       break;
2580     default:
2581       assert(false,"Should not reach here.");
2582       break;
2583   }
2584 }
2585 
// Emit a vector test of src1 against src2, setting flags for a following
// conditional branch. Vectors shorter than 16 bytes are widened by
// duplicating the low part of src1 into vtmp (src2 is assumed to already
// cover the full register — see comment below). For element sizes >= 4 the
// vtestps form is preferred when AVX is available.
void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
  assert(vlen_in_bytes <= 32, "");
  int esize = type2aelembytes(bt);
  if (vlen_in_bytes == 32) {
    // Full 256-bit test; no scratch register needed.
    assert(vtmp == xnoreg, "required.");
    if (esize >= 4) {
      vtestps(src1, src2, AVX_256bit);
    } else {
      vptest(src1, src2, AVX_256bit);
    }
    return;
  }
  if (vlen_in_bytes < 16) {
    // Duplicate the lower part to fill the whole register,
    // Don't need to do so for src2
    assert(vtmp != xnoreg, "required");
    // 0x00 replicates dword 0 four times; 0x04 replicates dwords {0,1} twice.
    int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
    pshufd(vtmp, src1, shuffle_imm);
  } else {
    // Exactly 16 bytes: test src1 directly.
    assert(vtmp == xnoreg, "required");
    vtmp = src1;
  }
  if (esize >= 4 && VM_Version::supports_avx()) {
    vtestps(vtmp, src2, AVX_128bit);
  } else {
    ptest(vtmp, src2);
  }
}
2614 
2615 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2616 #ifdef ASSERT
2617   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2618   bool is_bw_supported = VM_Version::supports_avx512bw();
2619   if (is_bw && !is_bw_supported) {
2620     assert(vlen_enc != Assembler::AVX_512bit, "required");
2621     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2622            "XMM register should be 0-15");
2623   }
2624 #endif // ASSERT
2625   switch (elem_bt) {
2626     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2627     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2628     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2629     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2630     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2631     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2632     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2633   }
2634 }
2635 
// Broadcast the scalar in GP register src into every element of vector dst.
// If the AVX-512 GPR-source broadcast forms are usable for this element
// type and vector length, emit a single evpbroadcast*. Otherwise fall back
// to moving the scalar into an XMM register first and broadcasting from it
// (AVX2 path), which requires a register encodable without EVEX (0-15).
void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
  assert(UseAVX >= 2, "required");
  bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
  bool is_vl = vlen_enc != Assembler::AVX_512bit;
  // Byte/short broadcasts need AVX512BW; sub-512-bit EVEX forms need AVX512VL.
  if ((UseAVX > 2) &&
      (!is_bw || VM_Version::supports_avx512bw()) &&
      (!is_vl || VM_Version::supports_avx512vl())) {
    switch (elem_bt) {
      case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
      case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
      case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
      case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  } else {
    assert(vlen_enc != Assembler::AVX_512bit, "required");
    assert((dst->encoding() < 16),"XMM register should be 0-15");
    switch (elem_bt) {
      case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
      case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
      case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
      case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
      case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
      case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  }
}
2664 
2665 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2666   switch (to_elem_bt) {
2667     case T_SHORT:
2668       vpmovsxbw(dst, src, vlen_enc);
2669       break;
2670     case T_INT:
2671       vpmovsxbd(dst, src, vlen_enc);
2672       break;
2673     case T_FLOAT:
2674       vpmovsxbd(dst, src, vlen_enc);
2675       vcvtdq2ps(dst, dst, vlen_enc);
2676       break;
2677     case T_LONG:
2678       vpmovsxbq(dst, src, vlen_enc);
2679       break;
2680     case T_DOUBLE: {
2681       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2682       vpmovsxbd(dst, src, mid_vlen_enc);
2683       vcvtdq2pd(dst, dst, vlen_enc);
2684       break;
2685     }
2686     default:
2687       fatal("Unsupported type %s", type2name(to_elem_bt));
2688       break;
2689   }
2690 }
2691 
2692 //-------------------------------------------------------------------------------------------
2693 
// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through stack.
//
// Register contract (enforced by the assert below, required by pcmpestri):
//   str1    - address of the string being scanned
//   cnt1    - rdx, string length in elements
//   str2    - address of the substring; its constant length is int_cnt2
//   cnt2    - rax, loaded with int_cnt2 here
//   result  - returns the match index, or -1 if not found
//   vec     - XMM register holding (the current window of) the substring
//   tmp     - rcx; pcmpestri writes the matched element index here
//   ae      - argument encodings (StrIntrinsicNode::LL/UU/UL; LU is invalid)
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2,  Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");

  // Load substring.
  // For UL the Latin-1 substring bytes are zero-extended to shorts so they
  // can be compared against the UTF-16 string.
  if (ae == StrIntrinsicNode::UL) {
    pmovzxbw(vec, Address(str2, 0));
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring reminding elements and
    // cnt1 is number of string reminding elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less then substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less then substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only string if does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    negptr(cnt2);
    addptr(cnt2, stride);

    bind(SCAN_SUBSTR);
    subl(cnt1, stride);
    cmpl(cnt2, -stride); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, stride);
    movl(cnt2, stride); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      // Offsets fit in an immediate displacement.
      int tail_off1 = int_cnt2<<scale1;
      int tail_off2 = int_cnt2<<scale2;
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
      } else {
        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
      }
      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
      } else {
        movdqu(vec, Address(str2, tmp, scale2, 0));
      }
      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
    }
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, stride);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(EXIT);

} // string_indexofC8
2872 
2873 // Small strings are loaded through stack if they cross page boundary.
2874 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2875                                        Register cnt1, Register cnt2,
2876                                        int int_cnt2,  Register result,
2877                                        XMMRegister vec, Register tmp,
2878                                        int ae) {
2879   ShortBranchVerifier sbv(this);
2880   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2881   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2882 
2883   //
2884   // int_cnt2 is length of small (< 8 chars) constant substring
2885   // or (-1) for non constant substring in which case its length
2886   // is in cnt2 register.
2887   //
2888   // Note, inline_string_indexOf() generates checks:
2889   // if (substr.count > string.count) return -1;
2890   // if (substr.count == 0) return 0;
2891   //
2892   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2893   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2894   // This method uses the pcmpestri instruction with bound registers
2895   //   inputs:
2896   //     xmm - substring
2897   //     rax - substring length (elements count)
2898   //     mem - scanned string
2899   //     rdx - string length (elements count)
2900   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2901   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2902   //   outputs:
2903   //     rcx - matched index in string
2904   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2905   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2906   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2907   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2908 
2909   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2910         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2911         FOUND_CANDIDATE;
2912 
2913   { //========================================================
2914     // We don't know where these strings are located
2915     // and we can't read beyond them. Load them through stack.
2916     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2917 
2918     movptr(tmp, rsp); // save old SP
2919 
2920     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2921       if (int_cnt2 == (1>>scale2)) { // One byte
2922         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2923         load_unsigned_byte(result, Address(str2, 0));
2924         movdl(vec, result); // move 32 bits
2925       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2926         // Not enough header space in 32-bit VM: 12+3 = 15.
2927         movl(result, Address(str2, -1));
2928         shrl(result, 8);
2929         movdl(vec, result); // move 32 bits
2930       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2931         load_unsigned_short(result, Address(str2, 0));
2932         movdl(vec, result); // move 32 bits
2933       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2934         movdl(vec, Address(str2, 0)); // move 32 bits
2935       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2936         movq(vec, Address(str2, 0));  // move 64 bits
2937       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2938         // Array header size is 12 bytes in 32-bit VM
2939         // + 6 bytes for 3 chars == 18 bytes,
2940         // enough space to load vec and shift.
2941         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2942         if (ae == StrIntrinsicNode::UL) {
2943           int tail_off = int_cnt2-8;
2944           pmovzxbw(vec, Address(str2, tail_off));
2945           psrldq(vec, -2*tail_off);
2946         }
2947         else {
2948           int tail_off = int_cnt2*(1<<scale2);
2949           movdqu(vec, Address(str2, tail_off-16));
2950           psrldq(vec, 16-tail_off);
2951         }
2952       }
2953     } else { // not constant substring
2954       cmpl(cnt2, stride);
2955       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2956 
2957       // We can read beyond string if srt+16 does not cross page boundary
2958       // since heaps are aligned and mapped by pages.
2959       assert(os::vm_page_size() < (int)G, "default page should be small");
2960       movl(result, str2); // We need only low 32 bits
2961       andl(result, ((int)os::vm_page_size()-1));
2962       cmpl(result, ((int)os::vm_page_size()-16));
2963       jccb(Assembler::belowEqual, CHECK_STR);
2964 
2965       // Move small strings to stack to allow load 16 bytes into vec.
2966       subptr(rsp, 16);
2967       int stk_offset = wordSize-(1<<scale2);
2968       push(cnt2);
2969 
2970       bind(COPY_SUBSTR);
2971       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2972         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2973         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2974       } else if (ae == StrIntrinsicNode::UU) {
2975         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2976         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2977       }
2978       decrement(cnt2);
2979       jccb(Assembler::notZero, COPY_SUBSTR);
2980 
2981       pop(cnt2);
2982       movptr(str2, rsp);  // New substring address
2983     } // non constant
2984 
2985     bind(CHECK_STR);
2986     cmpl(cnt1, stride);
2987     jccb(Assembler::aboveEqual, BIG_STRINGS);
2988 
2989     // Check cross page boundary.
2990     movl(result, str1); // We need only low 32 bits
2991     andl(result, ((int)os::vm_page_size()-1));
2992     cmpl(result, ((int)os::vm_page_size()-16));
2993     jccb(Assembler::belowEqual, BIG_STRINGS);
2994 
2995     subptr(rsp, 16);
2996     int stk_offset = -(1<<scale1);
2997     if (int_cnt2 < 0) { // not constant
2998       push(cnt2);
2999       stk_offset += wordSize;
3000     }
3001     movl(cnt2, cnt1);
3002 
3003     bind(COPY_STR);
3004     if (ae == StrIntrinsicNode::LL) {
3005       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3006       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3007     } else {
3008       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3009       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3010     }
3011     decrement(cnt2);
3012     jccb(Assembler::notZero, COPY_STR);
3013 
3014     if (int_cnt2 < 0) { // not constant
3015       pop(cnt2);
3016     }
3017     movptr(str1, rsp);  // New string address
3018 
3019     bind(BIG_STRINGS);
3020     // Load substring.
3021     if (int_cnt2 < 0) { // -1
3022       if (ae == StrIntrinsicNode::UL) {
3023         pmovzxbw(vec, Address(str2, 0));
3024       } else {
3025         movdqu(vec, Address(str2, 0));
3026       }
3027       push(cnt2);       // substr count
3028       push(str2);       // substr addr
3029       push(str1);       // string addr
3030     } else {
3031       // Small (< 8 chars) constant substrings are loaded already.
3032       movl(cnt2, int_cnt2);
3033     }
3034     push(tmp);  // original SP
3035 
3036   } // Finished loading
3037 
3038   //========================================================
3039   // Start search
3040   //
3041 
3042   movptr(result, str1); // string addr
3043 
3044   if (int_cnt2  < 0) {  // Only for non constant substring
3045     jmpb(SCAN_TO_SUBSTR);
3046 
3047     // SP saved at sp+0
3048     // String saved at sp+1*wordSize
3049     // Substr saved at sp+2*wordSize
3050     // Substr count saved at sp+3*wordSize
3051 
3052     // Reload substr for rescan, this code
3053     // is executed only for large substrings (> 8 chars)
3054     bind(RELOAD_SUBSTR);
3055     movptr(str2, Address(rsp, 2*wordSize));
3056     movl(cnt2, Address(rsp, 3*wordSize));
3057     if (ae == StrIntrinsicNode::UL) {
3058       pmovzxbw(vec, Address(str2, 0));
3059     } else {
3060       movdqu(vec, Address(str2, 0));
3061     }
3062     // We came here after the beginning of the substring was
3063     // matched but the rest of it was not so we need to search
3064     // again. Start from the next element after the previous match.
3065     subptr(str1, result); // Restore counter
3066     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3067       shrl(str1, 1);
3068     }
3069     addl(cnt1, str1);
3070     decrementl(cnt1);   // Shift to next element
3071     cmpl(cnt1, cnt2);
3072     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
3073 
3074     addptr(result, (1<<scale1));
3075   } // non constant
3076 
3077   // Scan string for start of substr in 16-byte vectors
3078   bind(SCAN_TO_SUBSTR);
3079   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3080   pcmpestri(vec, Address(result, 0), mode);
3081   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3082   subl(cnt1, stride);
3083   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3084   cmpl(cnt1, cnt2);
3085   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
3086   addptr(result, 16);
3087 
3088   bind(ADJUST_STR);
3089   cmpl(cnt1, stride); // Do not read beyond string
3090   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3091   // Back-up string to avoid reading beyond string.
3092   lea(result, Address(result, cnt1, scale1, -16));
3093   movl(cnt1, stride);
3094   jmpb(SCAN_TO_SUBSTR);
3095 
3096   // Found a potential substr
3097   bind(FOUND_CANDIDATE);
3098   // After pcmpestri tmp(rcx) contains matched element index
3099 
3100   // Make sure string is still long enough
3101   subl(cnt1, tmp);
3102   cmpl(cnt1, cnt2);
3103   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3104   // Left less then substring.
3105 
3106   bind(RET_NOT_FOUND);
3107   movl(result, -1);
3108   jmp(CLEANUP);
3109 
3110   bind(FOUND_SUBSTR);
3111   // Compute start addr of substr
3112   lea(result, Address(result, tmp, scale1));
3113   if (int_cnt2 > 0) { // Constant substring
3114     // Repeat search for small substring (< 8 chars)
3115     // from new point without reloading substring.
3116     // Have to check that we don't read beyond string.
3117     cmpl(tmp, stride-int_cnt2);
3118     jccb(Assembler::greater, ADJUST_STR);
3119     // Fall through if matched whole substring.
3120   } else { // non constant
3121     assert(int_cnt2 == -1, "should be != 0");
3122 
3123     addl(tmp, cnt2);
3124     // Found result if we matched whole substring.
3125     cmpl(tmp, stride);
3126     jcc(Assembler::lessEqual, RET_FOUND);
3127 
3128     // Repeat search for small substring (<= 8 chars)
3129     // from new point 'str1' without reloading substring.
3130     cmpl(cnt2, stride);
3131     // Have to check that we don't read beyond string.
3132     jccb(Assembler::lessEqual, ADJUST_STR);
3133 
3134     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3135     // Compare the rest of substring (> 8 chars).
3136     movptr(str1, result);
3137 
3138     cmpl(tmp, cnt2);
3139     // First 8 chars are already matched.
3140     jccb(Assembler::equal, CHECK_NEXT);
3141 
3142     bind(SCAN_SUBSTR);
3143     pcmpestri(vec, Address(str1, 0), mode);
3144     // Need to reload strings pointers if not matched whole vector
3145     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3146 
3147     bind(CHECK_NEXT);
3148     subl(cnt2, stride);
3149     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3150     addptr(str1, 16);
3151     if (ae == StrIntrinsicNode::UL) {
3152       addptr(str2, 8);
3153     } else {
3154       addptr(str2, 16);
3155     }
3156     subl(cnt1, stride);
3157     cmpl(cnt2, stride); // Do not read beyond substring
3158     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3159     // Back-up strings to avoid reading beyond substring.
3160 
3161     if (ae == StrIntrinsicNode::UL) {
3162       lea(str2, Address(str2, cnt2, scale2, -8));
3163       lea(str1, Address(str1, cnt2, scale1, -16));
3164     } else {
3165       lea(str2, Address(str2, cnt2, scale2, -16));
3166       lea(str1, Address(str1, cnt2, scale1, -16));
3167     }
3168     subl(cnt1, cnt2);
3169     movl(cnt2, stride);
3170     addl(cnt1, stride);
3171     bind(CONT_SCAN_SUBSTR);
3172     if (ae == StrIntrinsicNode::UL) {
3173       pmovzxbw(vec, Address(str2, 0));
3174     } else {
3175       movdqu(vec, Address(str2, 0));
3176     }
3177     jmp(SCAN_SUBSTR);
3178 
3179     bind(RET_FOUND_LONG);
3180     movptr(str1, Address(rsp, wordSize));
3181   } // non constant
3182 
3183   bind(RET_FOUND);
3184   // Compute substr offset
3185   subptr(result, str1);
3186   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3187     shrl(result, 1); // index
3188   }
3189   bind(CLEANUP);
3190   pop(rsp); // restore SP
3191 
3192 } // string_indexof
3193 
// Intrinsic for StringUTF16.indexOf(char): scan the char (16-bit) sequence at
// 'str1' (cnt1 chars) for the first occurrence of 'ch'.
// On exit 'result' holds the char index of the first match, or -1 if absent.
// Strategy: 32-byte AVX2 compares while >= 16 chars remain, then 16-byte SSE
// compares while >= 8 remain, then a scalar tail. 'ch' and 'tmp' are clobbered.
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 8;  // chars per 16-byte vector

  Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
        SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);  // 'result' doubles as the scan cursor until the end
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR);          // < 8 chars: scalar scan only
    cmpl(cnt1, 2*stride);
    jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);   // < 16 chars: 16-byte vectors only
    movdl(vec1, ch);
    vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);  // vec1 = ch replicated in all 16 word lanes
    vpxor(vec2, vec2);                                // vec2 = 0, reference operand for vptest below
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
    andl(cnt1,0x0000000F);  //tail count (in chars)

    bind(SCAN_TO_16_CHAR_LOOP);
    // Compare 16 chars (32 bytes) at a time.
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqw(vec3, vec3, vec1, 1);  // matching lanes become all-ones
    // vptest(0, vec3): CF == 1 iff vec3 is all zero, so carryClear => some lane matched.
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, 2*stride);
    jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
    jmp(SCAN_TO_8_CHAR);
    bind(SCAN_TO_8_CHAR_INIT);
    // SSE setup: replicate ch into all 8 word lanes of vec1, zero vec2.
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  bind(SCAN_TO_8_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR);
  if (UseAVX < 2) {
    // vec1/vec2 were not initialized above on the pure-SSE path; do it here.
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
  andl(cnt1,0x00000007);  //tail count (in chars)

  bind(SCAN_TO_8_CHAR_LOOP);
  // Compare 8 chars (16 bytes) at a time.
  movdqu(vec3, Address(result, 0));
  pcmpeqw(vec3, vec1);
  ptest(vec2, vec3);  // carryClear => some lane matched (see vptest note above)
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
  bind(SCAN_TO_CHAR);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  bind(SCAN_TO_CHAR_LOOP);
  // Scalar tail: one char per iteration.
  load_unsigned_short(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 2);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // A vector compare hit: find the byte offset of the first matching lane.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);        // index of lowest set mask bit (clobbers ch)
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1);  // cursor - base = byte offset of the match
  shrl(result, 1);       // bytes -> char index

  bind(DONE_LABEL);
} // string_indexof_char
3286 
// Intrinsic for StringLatin1.indexOf(char): scan the byte sequence at 'str1'
// (cnt1 bytes) for the first occurrence of the byte value 'ch'.
// On exit 'result' holds the byte index of the first match, or -1 if absent.
// Strategy: 32-byte AVX2 compares while >= 32 bytes remain, then 16-byte SSE
// compares while >= 16 remain, then a scalar tail. 'ch' and 'tmp' are clobbered.
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 16;  // bytes per 16-byte vector

  Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
        SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);  // 'result' doubles as the scan cursor until the end
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR_INIT);     // < 16 bytes: scalar scan only
    cmpl(cnt1, stride*2);
    jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);  // < 32 bytes: 16-byte vectors only
    movdl(vec1, ch);
    vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);  // vec1 = ch replicated in all 32 byte lanes
    vpxor(vec2, vec2);                                // vec2 = 0, reference operand for vptest below
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
    andl(cnt1,0x0000001F);  //tail count (in chars)

    bind(SCAN_TO_32_CHAR_LOOP);
    // Compare 32 bytes at a time.
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);  // matching lanes become all-ones
    // vptest(0, vec3): CF == 1 iff vec3 is all zero, so carryClear => some lane matched.
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, stride*2);
    jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
    jmp(SCAN_TO_16_CHAR);

    bind(SCAN_TO_16_CHAR_INIT);
    // SSE setup: replicate ch into all 16 byte lanes of vec1, zero vec2.
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);  // shuffle with all-zero control = broadcast byte 0
  }

  bind(SCAN_TO_16_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
  if (UseAVX < 2) {
    // vec1/vec2 were not initialized above on the pure-SSE path; do it here.
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
  andl(cnt1,0x0000000F);  //tail count (in bytes)

  bind(SCAN_TO_16_CHAR_LOOP);
  // Compare 16 bytes at a time.
  movdqu(vec3, Address(result, 0));
  pcmpeqb(vec3, vec1);
  ptest(vec2, vec3);  // carryClear => some lane matched (see vptest note above)
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...

  bind(SCAN_TO_CHAR_INIT);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  bind(SCAN_TO_CHAR_LOOP);
  // Scalar tail: one byte per iteration.
  load_unsigned_byte(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 1);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // A vector compare hit: find the byte offset of the first matching lane.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);        // index of lowest set mask bit (clobbers ch)
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1);  // cursor - base = byte index of the match

  bind(DONE_LABEL);
} // stringL_indexof_char
3379 
3380 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3381   switch (eltype) {
3382   case T_BOOLEAN: return sizeof(jboolean);
3383   case T_BYTE:  return sizeof(jbyte);
3384   case T_SHORT: return sizeof(jshort);
3385   case T_CHAR:  return sizeof(jchar);
3386   case T_INT:   return sizeof(jint);
3387   default:
3388     ShouldNotReachHere();
3389     return -1;
3390   }
3391 }
3392 
3393 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3394   switch (eltype) {
3395   // T_BOOLEAN used as surrogate for unsigned byte
3396   case T_BOOLEAN: movzbl(dst, src);   break;
3397   case T_BYTE:    movsbl(dst, src);   break;
3398   case T_SHORT:   movswl(dst, src);   break;
3399   case T_CHAR:    movzwl(dst, src);   break;
3400   case T_INT:     movl(dst, src);     break;
3401   default:
3402     ShouldNotReachHere();
3403   }
3404 }
3405 
3406 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3407   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3408 }
3409 
3410 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3411   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3412 }
3413 
3414 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3415   const int vlen = Assembler::AVX_256bit;
3416   switch (eltype) {
3417   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3418   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3419   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3420   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3421   case T_INT:
3422     // do nothing
3423     break;
3424   default:
3425     ShouldNotReachHere();
3426   }
3427 }
3428 
3429 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3430                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3431                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3432                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3433                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3434                                         BasicType eltype) {
3435   ShortBranchVerifier sbv(this);
3436   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3437   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3438   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3439 
3440   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3441         SHORT_UNROLLED_LOOP_EXIT,
3442         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3443         UNROLLED_VECTOR_LOOP_BEGIN,
3444         END;
3445   switch (eltype) {
3446   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3447   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3448   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3449   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3450   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3451   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3452   }
3453 
3454   // For "renaming" for readibility of the code
3455   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3456                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3457                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3458 
3459   const int elsize = arrays_hashcode_elsize(eltype);
3460 
3461   /*
3462     if (cnt1 >= 2) {
3463       if (cnt1 >= 32) {
3464         UNROLLED VECTOR LOOP
3465       }
3466       UNROLLED SCALAR LOOP
3467     }
3468     SINGLE SCALAR
3469    */
3470 
3471   cmpl(cnt1, 32);
3472   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3473 
3474   // cnt1 >= 32 && generate_vectorized_loop
3475   xorl(index, index);
3476 
3477   // vresult = IntVector.zero(I256);
3478   for (int idx = 0; idx < 4; idx++) {
3479     vpxor(vresult[idx], vresult[idx]);
3480   }
3481   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3482   Register bound = tmp2;
3483   Register next = tmp3;
3484   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3485   movl(next, Address(tmp2, 0));
3486   movdl(vnext, next);
3487   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3488 
3489   // index = 0;
3490   // bound = cnt1 & ~(32 - 1);
3491   movl(bound, cnt1);
3492   andl(bound, ~(32 - 1));
3493   // for (; index < bound; index += 32) {
3494   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3495   // result *= next;
3496   imull(result, next);
3497   // loop fission to upfront the cost of fetching from memory, OOO execution
3498   // can then hopefully do a better job of prefetching
3499   for (int idx = 0; idx < 4; idx++) {
3500     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3501   }
3502   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3503   for (int idx = 0; idx < 4; idx++) {
3504     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3505     arrays_hashcode_elvcast(vtmp[idx], eltype);
3506     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3507   }
3508   // index += 32;
3509   addl(index, 32);
3510   // index < bound;
3511   cmpl(index, bound);
3512   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3513   // }
3514 
3515   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3516   subl(cnt1, bound);
3517   // release bound
3518 
3519   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3520   for (int idx = 0; idx < 4; idx++) {
3521     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3522     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3523     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3524   }
3525   // result += vresult.reduceLanes(ADD);
3526   for (int idx = 0; idx < 4; idx++) {
3527     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3528   }
3529 
3530   // } else if (cnt1 < 32) {
3531 
3532   bind(SHORT_UNROLLED_BEGIN);
3533   // int i = 1;
3534   movl(index, 1);
3535   cmpl(index, cnt1);
3536   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3537 
3538   // for (; i < cnt1 ; i += 2) {
3539   bind(SHORT_UNROLLED_LOOP_BEGIN);
3540   movl(tmp3, 961);
3541   imull(result, tmp3);
3542   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3543   movl(tmp3, tmp2);
3544   shll(tmp3, 5);
3545   subl(tmp3, tmp2);
3546   addl(result, tmp3);
3547   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3548   addl(result, tmp3);
3549   addl(index, 2);
3550   cmpl(index, cnt1);
3551   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3552 
3553   // }
3554   // if (i >= cnt1) {
3555   bind(SHORT_UNROLLED_LOOP_EXIT);
3556   jccb(Assembler::greater, END);
3557   movl(tmp2, result);
3558   shll(result, 5);
3559   subl(result, tmp2);
3560   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3561   addl(result, tmp3);
3562   // }
3563   bind(END);
3564 
3565   BLOCK_COMMENT("} // arrays_hashcode");
3566 
3567 } // arrays_hashcode
3568 
3569 // helper function for string_compare
3570 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3571                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3572                                            Address::ScaleFactor scale2, Register index, int ae) {
3573   if (ae == StrIntrinsicNode::LL) {
3574     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3575     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3576   } else if (ae == StrIntrinsicNode::UU) {
3577     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3578     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3579   } else {
3580     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3581     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3582   }
3583 }
3584 
// Compare strings, used for char[] and byte[].
// 'ae' encodes the operand encodings (StrIntrinsicNode::LL/UU/LU/UL).
// cnt1/cnt2 are the element counts of str1/str2. On exit 'result' follows the
// compareTo convention: the difference of the first mismatching elements, or
// the length difference when one string is a prefix of the other. The signed
// length difference is parked on the stack (push(cnt1) below) until the final
// LENGTH_DIFF/POP paths. For UL the result is negated at the end.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result,
                                       XMMRegister vec1, int ae, KRegister mask) {
  ShortBranchVerifier sbv(this);
  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
  Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only AVX3
  int stride, stride2, adr_stride, adr_stride1, adr_stride2;
  int stride2x2 = 0x40;  // byte distance covered by the 64-byte AVX3 loop
  Address::ScaleFactor scale = Address::no_scale;
  Address::ScaleFactor scale1 = Address::no_scale;
  Address::ScaleFactor scale2 = Address::no_scale;

  if (ae != StrIntrinsicNode::LL) {
    stride2x2 = 0x20;  // element counts, not bytes, when chars are involved
  }

  // NOTE(review): for mixed encodings cnt2 appears to arrive in bytes and is
  // halved here to a char count — confirm against the matcher's conventions.
  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    shrl(cnt2, 1);
  }
  // Compute the minimum of the string lengths and the
  // difference of the string lengths (stack).
  // Do the conditional move stuff
  movl(result, cnt1);
  subl(cnt1, cnt2);
  push(cnt1);  // saved length difference, popped at LENGTH_DIFF/POP
  cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)

  // Is the minimum length zero?
  testl(cnt2, cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
  if (ae == StrIntrinsicNode::LL) {
    // Load first bytes
    load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
    load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
  } else if (ae == StrIntrinsicNode::UU) {
    // Load first characters
    load_unsigned_short(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  } else {
    load_unsigned_byte(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  }
  subl(result, cnt1);
  jcc(Assembler::notZero,  POP_LABEL);  // first elements differ: done

  if (ae == StrIntrinsicNode::UU) {
    // Divide length by 2 to get number of chars
    shrl(cnt2, 1);
  }
  cmpl(cnt2, 1);
  jcc(Assembler::equal, LENGTH_DIFF_LABEL);  // only one element, already equal

  // Check if the strings start at the same location and setup scale and stride
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    cmpptr(str1, str2);
    jcc(Assembler::equal, LENGTH_DIFF_LABEL);  // same memory: equal up to min length
    if (ae == StrIntrinsicNode::LL) {
      scale = Address::times_1;
      stride = 16;
    } else {
      scale = Address::times_2;
      stride = 8;
    }
  } else {
    scale1 = Address::times_1;
    scale2 = Address::times_2;
    // scale not used
    stride = 8;
  }

  if (UseAVX >= 2 && UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
    Label COMPARE_TAIL_LONG;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only AVX3

    // pcmpestri mode: string compare with negated result; low bits select
    // unsigned shorts (0x01) vs unsigned bytes.
    int pcmpmask = 0x19;
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }

    // Setup to compare 16-chars (32-bytes) vectors,
    // start from first character again because it has aligned address.
    if (ae == StrIntrinsicNode::LL) {
      stride2 = 32;
    } else {
      stride2 = 16;
    }
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      adr_stride = stride << scale;
    } else {
      adr_stride1 = 8;  //stride << scale1;
      adr_stride2 = 16; //stride << scale2;
    }

    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as elements counters
    movl(result, cnt2);
    andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
    jcc(Assembler::zero, COMPARE_TAIL_LONG);

    // fast path : compare first 2 8-char vectors.
    bind(COMPARE_16_CHARS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));  // widen str1's bytes to chars
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jccb(Assembler::below, COMPARE_INDEX_CHAR);  // CF==1: mismatch in first vector

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, adr_stride));
      pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, adr_stride1));
      pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);  // no mismatch: go vector loop
    addl(cnt1, stride);  // mismatch index is relative to the second vector

    // Compare the characters at index in cnt1
    bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmp(POP_LABEL);

    // Setup the registers to start vector comparison loop
    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    subl(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::zero, COMPARE_WIDE_TAIL);
    negptr(result);  // result becomes a negative index counting up to zero

    //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
    bind(COMPARE_WIDE_VECTORS_LOOP);

    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      cmpl(cnt2, stride2x2);
      jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
      testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
        evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      } else {
        vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      }
      // CF==1 iff mask is all ones (fully equal); aboveEqual (CF==0) => mismatch.
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
      addptr(result, stride2x2);  // update since we already compared at this addr
      subl(cnt2, stride2x2);      // and sub the size too
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      vpxor(vec1, vec1);
      jmpb(COMPARE_WIDE_TAIL);
    }//if (VM_Version::supports_avx512vlbw())

    bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      vmovdqu(vec1, Address(str1, result, scale));
      vpxor(vec1, Address(str2, result, scale));  // zero iff the 32 bytes are equal
    } else {
      vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
      vpxor(vec1, Address(str2, result, scale2));
    }
    vptest(vec1, vec1);
    jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
    addptr(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);

    // compare wide vectors tail
    bind(COMPARE_WIDE_TAIL);
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    // Re-run the last full vector over the tail (overlapping compare).
    movl(result, stride2);
    movl(cnt2, result);
    negptr(result);
    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
    bind(VECTOR_NOT_EQUAL);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    jmp(COMPARE_16_CHARS);  // pinpoint the mismatch with pcmpestri

    // Compare tail chars, length between 1 to 15 chars
    bind(COMPARE_TAIL_LONG);
    movl(cnt2, result);
    cmpl(cnt2, stride);
    jcc(Assembler::less, COMPARE_SMALL_STR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jcc(Assembler::below, COMPARE_INDEX_CHAR);
    subptr(cnt2, stride);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(cnt2);
    jmpb(WHILE_HEAD_LABEL);

    bind(COMPARE_SMALL_STR);
  } else if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Setup to compare 8-char (16-byte) vectors,
    // start from first character again because it has aligned address.
    movl(result, cnt2);
    andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }
    jcc(Assembler::zero, COMPARE_TAIL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(result);

    // pcmpestri
    //   inputs:
    //     vec1- substring
    //     rax - negative string length (elements count)
    //     mem - scanned string
    //     rdx - string length (elements count)
    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
    //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
    //   outputs:
    //     rcx - first mismatched element index
    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");

    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    // After pcmpestri cnt1(rcx) contains mismatched element index

    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
    addptr(result, stride);
    subptr(cnt2, stride);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

    // compare wide vectors tail
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    // Re-run the last full vector over the tail (overlapping compare).
    movl(cnt2, stride);
    movl(result, stride);
    negptr(result);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);

    // Mismatched characters in the vectors
    bind(VECTOR_NOT_EQUAL);
    addptr(cnt1, result);  // absolute index = vector base + mismatch offset
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmpb(POP_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(cnt2, result);
    // Fallthru to tail compare
  }
  // Shift str2 and str1 to the end of the arrays, negate min
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    lea(str1, Address(str1, cnt2, scale));
    lea(str2, Address(str2, cnt2, scale));
  } else {
    lea(str1, Address(str1, cnt2, scale1));
    lea(str2, Address(str2, cnt2, scale2));
  }
  decrementl(cnt2);  // first character was compared already
  negptr(cnt2);

  // Compare the rest of the elements
  bind(WHILE_HEAD_LABEL);
  load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
  subl(result, cnt1);
  jccb(Assembler::notZero, POP_LABEL);
  increment(cnt2);
  jccb(Assembler::notZero, WHILE_HEAD_LABEL);

  // Strings are equal up to min length.  Return the length difference.
  bind(LENGTH_DIFF_LABEL);
  pop(result);  // reload the saved cnt1 - cnt2
  if (ae == StrIntrinsicNode::UU) {
    // Divide diff by 2 to get number of chars
    sarl(result, 1);
  }
  jmpb(DONE_LABEL);

  if (VM_Version::supports_avx512vlbw()) {

    bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);

    // Locate the first differing byte: invert the equality mask and find the
    // lowest set bit.
    kmovql(cnt1, mask);
    notq(cnt1);
    bsfq(cnt2, cnt1);
    if (ae != StrIntrinsicNode::LL) {
      // Divide diff by 2 to get number of chars
      sarl(cnt2, 1);
    }
    addq(result, cnt2);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(cnt1, Address(str2, result));
      load_unsigned_byte(result, Address(str1, result));
    } else if (ae == StrIntrinsicNode::UU) {
      load_unsigned_short(cnt1, Address(str2, result, scale));
      load_unsigned_short(result, Address(str1, result, scale));
    } else {
      load_unsigned_short(cnt1, Address(str2, result, scale2));
      load_unsigned_byte(result, Address(str1, result, scale1));
    }
    subl(result, cnt1);
    jmpb(POP_LABEL);
  }//if (VM_Version::supports_avx512vlbw())

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
  // NOTE(review): presumably the UL variant runs with operands swapped
  // relative to the Java-level call, hence the sign flip — confirm against
  // the matcher rules for StrComp.
  if(ae == StrIntrinsicNode::UL) {
    negl(result);
  }

}
3959 
3960 // Search for Non-ASCII character (Negative byte value) in a byte array,
3961 // return the index of the first such character, otherwise the length
3962 // of the array segment searched.
3963 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3964 //   @IntrinsicCandidate
3965 //   public static int countPositives(byte[] ba, int off, int len) {
3966 //     for (int i = off; i < off + len; i++) {
3967 //       if (ba[i] < 0) {
3968 //         return i - off;
3969 //       }
3970 //     }
3971 //     return len;
3972 //   }
// Count the number of leading non-negative bytes in the byte-array segment
// starting at 'ary1' with length 'len' (see the countPositives() reference
// implementation quoted above).  On return 'result' holds 'len' when no
// negative byte was found, otherwise the index of the first negative byte.
// Clobbers 'len' and 'ary1'; tmp1/vec1/vec2/mask1/mask2 are scratch.
void C2_MacroAssembler::count_positives(Register ary1, Register len,
  Register result, Register tmp1,
  XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  movl(result, len); // copy; 'result' tracks the value to be returned
  // len == 0: nothing to scan, result is already 0
  testl(len, len);
  jcc(Assembler::zero, DONE);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
    VM_Version::supports_avx512vlbw() &&
    VM_Version::supports_bmi2()) {

    Label test_64_loop, test_tail, BREAK_LOOP;
    movl(tmp1, len);
    // vec2 = all zeroes: the signed compare "0 > byte" below flags exactly
    // the negative bytes.
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);

    andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
    andl(len,  0xffffffc0); // vector count (in chars)
    jccb(Assembler::zero, test_tail);

    // Point ary1 past the vectorized region and run 'len' from -count up to
    // zero so the loop exit falls out of the add's flags.
    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len);

    bind(test_64_loop);
    // Check whether our 64 elements of size byte contain negatives
    evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(mask1, mask1);
    jcc(Assembler::notZero, BREAK_LOOP);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done (tail count is zero)
    testl(tmp1, -1);
    jcc(Assembler::zero, DONE);


    // check the tail for absence of negatives
    // ~(~0 << len) applied up to two times (for 32-bit scenario)
    {
      Register tmp3_aliased = len;
      mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
      shlxq(tmp3_aliased, tmp3_aliased, tmp1);
      notq(tmp3_aliased);
      // mask2 now has a 1-bit for each of the tmp1 tail bytes
      kmovql(mask2, tmp3_aliased);
    }

    // Masked compare: only the tail lanes selected by mask2 can set mask1
    evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(mask1, mask2);
    jcc(Assembler::zero, DONE);

    // do a full check for negative registers in the tail
    movl(len, tmp1); // tmp1 holds low 6-bit from original len;
                     // ary1 already pointing to the right place
    jmpb(TAIL_START);

    bind(BREAK_LOOP);
    // At least one byte in the last 64 byte block was negative.
    // Set up to look at the last 64 bytes as if they were a tail
    lea(ary1, Address(ary1, len, Address::times_1));
    addptr(result, len);
    // Ignore the very last byte: if all others are positive,
    // it must be negative, so we can skip right to the 2+1 byte
    // end comparison at this point
    orl(result, 63);
    movl(len, 63);
    // Fallthru to tail compare
  } else {

    if (UseAVX >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 32-byte vectors
      testl(len, 0xffffffe0);   // vector count (in bytes)
      jccb(Assembler::zero, TAIL_START);

      andl(len, 0xffffffe0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      // vptest ANDs with the 0x80 mask: non-zero iff some sign bit is set
      vptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 32);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000001f);   // any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000001f);
      // Re-read the last (possibly overlapping) 32 bytes ending at the tail
      vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
      vptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      // There are zeros, jump to the tail to determine exactly where
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 32-byte vector is negative.
      // Set up to look at the last 32 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 31);
      movl(len, 31);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 16-byte vectors
      testl(len, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, TAIL_START);

      andl(len, 0xfffffff0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);   // sign-bit mask, broadcast to all 16 bytes
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 16);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000000f); // len is zero, any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000000f);   // tail count (in bytes)
      movdqu(vec1, Address(ary1, len, Address::times_1, -16));
      ptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 16-byte vector is negative.
      // Set up and look at the last 16 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 15);
      movl(len, 15);
      // Fallthru to tail compare
    }
  }

  bind(TAIL_START);
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  andl(tmp1, 0x80808080);
  jccb(Assembler::notZero, TAIL_ADJUST);
  addptr(len, 4);
  jccb(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2-3 bytes), if any
  bind(COMPARE_CHAR);

  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080);
  jccb(Assembler::notZero, CHAR_ADJUST);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1);   // tail  byte
  jccb(Assembler::zero, DONE);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  testl(tmp1, 0x00000080);
  jccb(Assembler::zero, DONE);
  // Last byte is negative: it is not part of the positive prefix
  subptr(result, 1);
  jmpb(DONE);

  bind(TAIL_ADJUST);
  // there are negative bits in the last 4 byte block.
  // Adjust result and check the next three bytes
  addptr(result, len);
  orl(result, 3);
  lea(ary1, Address(ary1, len, Address::times_1));
  jmpb(COMPARE_CHAR);

  bind(CHAR_ADJUST);
  // We are looking at a char + optional byte tail, and found that one
  // of the bytes in the char is negative. Adjust the result, check the
  // first byte and readjust if needed.
  andl(result, 0xfffffffc);
  testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
  jccb(Assembler::notZero, DONE);
  addptr(result, 1);

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
4204 
// Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
// Sets 'result' to 1 when the two sequences are equal and 0 otherwise.
// When 'is_array_equ' the inputs are array oops and null/length checks are
// emitted here; otherwise ary1/ary2 already point at the data and 'limit'
// holds the element count.  With 'expand_ary2' (AVX2 only) ary2's bytes are
// zero-extended to 16-bit elements before comparing against ary1.
// Clobbers ary1/ary2/limit/chr; vec1/vec2/mask are scratch.
void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                      Register limit, Register result, Register chr,
                                      XMMRegister vec1, XMMRegister vec2, bool is_char,
                                      KRegister mask, bool expand_ary2) {
  // for expand_ary2, limit is the (smaller) size of the second array.
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;

  assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
         "Expansion only implemented for AVX2");

  int length_offset  = arrayOopDesc::length_offset_in_bytes();
  int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);

  // When expanding, ary1 is indexed in 16-bit units and advances half as fast.
  Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
  int scaleIncr = expand_ary2 ? 8 : 16;

  if (is_array_equ) {
    // Check the input args
    cmpoop(ary1, ary2);
    jcc(Assembler::equal, TRUE_LABEL);

    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }

  if (is_array_equ && is_char) {
    // arrays_equals when used for char[].
    shll(limit, 1);      // byte count != 0
  }
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;

    // Compare 32-byte vectors
    if (expand_ary2) {
      andl(result, 0x0000000f);  //   tail count (in bytes)
      andl(limit, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL);
    } else {
      andl(result, 0x0000001f);  //   tail count (in bytes)
      andl(limit, 0xffffffe0);   // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL_16);
    }

    lea(ary1, Address(ary1, limit, scaleFactor));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;

      cmpl(limit, -64);
      jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop

      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
      // kortest sets CF only when the mask is all-ones (all 64 bytes equal);
      // aboveEqual (CF == 0) therefore means at least one byte differed.
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
      addptr(limit, 64);  // update since we already compared at this addr
      cmpl(limit, -64);
      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instruction and just continue via non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL);  // true
      // But since we stopped at the points ary{1,2}+limit which are
      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
      //
      addptr(result, -64);   // it is safe, bc we just came from this area
      evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare

      jmp(TRUE_LABEL);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    }//if (VM_Version::supports_avx512vlbw())

    bind(COMPARE_WIDE_VECTORS);
    vmovdqu(vec1, Address(ary1, limit, scaleFactor));
    if (expand_ary2) {
      // Widen 16 bytes of ary2 to 16 shorts to match ary1's element size
      vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
    } else {
      vmovdqu(vec2, Address(ary2, limit, Address::times_1));
    }
    // XOR is zero iff the 32-byte chunks are identical
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, scaleIncr * 2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    // Compare the last (possibly overlapping) 32 bytes ending at the tail
    vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
    } else {
      vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
    }
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    jmp(TRUE_LABEL);

    bind(COMPARE_TAIL_16); // limit is zero
    movl(limit, result);

    // Compare 16-byte chunks
    andl(result, 0x0000000f);  //   tail count (in bytes)
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, scaleFactor));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS_16);
    movdqu(vec1, Address(ary1, limit, scaleFactor));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
    } else {
      movdqu(vec2, Address(ary2, limit, Address::times_1));
    }
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, scaleIncr);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  } else if (UseSSE42Intrinsics) {
    // With SSE4.2, use double quad vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 16-byte vectors
    andl(result, 0x0000000f);  //   tail count (in bytes)
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(ary1, limit, Address::times_1));
    movdqu(vec2, Address(ary2, limit, Address::times_1));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 16);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  }

  // Compare 4-byte vectors
  if (expand_ary2) {
    testl(result, result);
    jccb(Assembler::zero, TRUE_LABEL);
  } else {
    andl(limit, 0xfffffffc); // vector count (in bytes)
    jccb(Assembler::zero, COMPARE_CHAR);
  }

  lea(ary1, Address(ary1, limit, scaleFactor));
  lea(ary2, Address(ary2, limit, Address::times_1));
  negptr(limit);

  bind(COMPARE_VECTORS);
  if (expand_ary2) {
    // There are no "vector" operations for bytes to shorts
    movzbl(chr, Address(ary2, limit, Address::times_1));
    cmpw(Address(ary1, limit, Address::times_2), chr);
    jccb(Assembler::notEqual, FALSE_LABEL);
    addptr(limit, 1);
    jcc(Assembler::notZero, COMPARE_VECTORS);
    jmp(TRUE_LABEL);
  } else {
    movl(chr, Address(ary1, limit, Address::times_1));
    cmpl(chr, Address(ary2, limit, Address::times_1));
    jccb(Assembler::notEqual, FALSE_LABEL);
    addptr(limit, 4);
    jcc(Assembler::notZero, COMPARE_VECTORS);
  }

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(chr, Address(ary1, 0));
  load_unsigned_short(limit, Address(ary2, 0));
  cmpl(chr, limit);
  jccb(Assembler::notEqual, FALSE_LABEL);

  if (is_array_equ && is_char) {
    // char[] lengths are even in bytes, so no single-byte tail is possible
    bind(COMPARE_BYTE);
  } else {
    lea(ary1, Address(ary1, 2));
    lea(ary2, Address(ary2, 2));

    bind(COMPARE_BYTE);
    testl(result, 0x1);   // tail  byte
    jccb(Assembler::zero, TRUE_LABEL);
    load_unsigned_byte(chr, Address(ary1, 0));
    load_unsigned_byte(limit, Address(ary2, 0));
    cmpl(chr, limit);
    jccb(Assembler::notEqual, FALSE_LABEL);
  }
  bind(TRUE_LABEL);
  movl(result, 1);   // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
4477 
// Out-of-line slow path for convertF2I: spills 'src' to a fresh 8-byte stack
// slot, calls the fixup stub 'target', then pops the corrected result into
// 'dst' (the stub leaves its result in the slot the argument was spilled to,
// as implied by the pop below).  Emitted code size must stay within the
// max_size budget computed in convertF2I.
static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
#define __ masm.
  Register dst = stub.data<0>();
  XMMRegister src = stub.data<1>();
  address target = stub.data<2>();
  __ bind(stub.entry());
  __ subptr(rsp, 8);
  __ movdbl(Address(rsp), src);
  __ call(RuntimeAddress(target));
  // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
  __ pop(dst);
  __ jmp(stub.continuation());
#undef __
}
4492 
// Convert the float/double in 'src' (src_bt) to the int/long in 'dst' (dst_bt).
// The CVTT* truncating conversions produce the "integer indefinite" value
// (0x80000000 / 0x8000000000000000, per the Intel SDM) when the input is NaN
// or out of range, so a match against that sentinel dispatches to the fixup
// stub that computes the Java-specified result.
void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
  assert(dst_bt == T_INT || dst_bt == T_LONG, "");
  assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");

  address slowpath_target;
  if (dst_bt == T_INT) {
    if (src_bt == T_FLOAT) {
      cvttss2sil(dst, src);
      cmpl(dst, 0x80000000);
      slowpath_target = StubRoutines::x86::f2i_fixup();
    } else {
      cvttsd2sil(dst, src);
      cmpl(dst, 0x80000000);
      slowpath_target = StubRoutines::x86::d2i_fixup();
    }
  } else {
    if (src_bt == T_FLOAT) {
      cvttss2siq(dst, src);
      // double_sign_flip holds 0x8000000000000000 — the 64-bit indefinite value
      cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
      slowpath_target = StubRoutines::x86::f2l_fixup();
    } else {
      cvttsd2siq(dst, src);
      cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
      slowpath_target = StubRoutines::x86::d2l_fixup();
    }
  }

  // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
  int max_size = 23 + (UseAPX ? 1 : 0);
  auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
  jcc(Assembler::equal, stub->entry());
  bind(stub->continuation());
}
4526 
4527 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4528                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4529   switch(ideal_opc) {
4530     case Op_LShiftVS:
4531       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4532     case Op_LShiftVI:
4533       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4534     case Op_LShiftVL:
4535       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4536     case Op_RShiftVS:
4537       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4538     case Op_RShiftVI:
4539       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4540     case Op_RShiftVL:
4541       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4542     case Op_URShiftVS:
4543       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4544     case Op_URShiftVI:
4545       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4546     case Op_URShiftVL:
4547       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4548     case Op_RotateRightV:
4549       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4550     case Op_RotateLeftV:
4551       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4552     default:
4553       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4554       break;
4555   }
4556 }
4557 
4558 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4559                                                XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4560   if (is_unsigned) {
4561     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4562   } else {
4563     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4564   }
4565 }
4566 
4567 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4568                                                       XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4569   switch (elem_bt) {
4570     case T_BYTE:
4571       if (ideal_opc == Op_SaturatingAddV) {
4572         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4573       } else {
4574         assert(ideal_opc == Op_SaturatingSubV, "");
4575         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4576       }
4577       break;
4578     case T_SHORT:
4579       if (ideal_opc == Op_SaturatingAddV) {
4580         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4581       } else {
4582         assert(ideal_opc == Op_SaturatingSubV, "");
4583         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4584       }
4585       break;
4586     default:
4587       fatal("Unsupported type %s", type2name(elem_bt));
4588       break;
4589   }
4590 }
4591 
4592 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4593                                                         XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4594   switch (elem_bt) {
4595     case T_BYTE:
4596       if (ideal_opc == Op_SaturatingAddV) {
4597         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4598       } else {
4599         assert(ideal_opc == Op_SaturatingSubV, "");
4600         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4601       }
4602       break;
4603     case T_SHORT:
4604       if (ideal_opc == Op_SaturatingAddV) {
4605         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4606       } else {
4607         assert(ideal_opc == Op_SaturatingSubV, "");
4608         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4609       }
4610       break;
4611     default:
4612       fatal("Unsupported type %s", type2name(elem_bt));
4613       break;
4614   }
4615 }
4616 
4617 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4618                                                Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4619   if (is_unsigned) {
4620     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4621   } else {
4622     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4623   }
4624 }
4625 
4626 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4627                                                       XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4628   switch (elem_bt) {
4629     case T_BYTE:
4630       if (ideal_opc == Op_SaturatingAddV) {
4631         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4632       } else {
4633         assert(ideal_opc == Op_SaturatingSubV, "");
4634         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4635       }
4636       break;
4637     case T_SHORT:
4638       if (ideal_opc == Op_SaturatingAddV) {
4639         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4640       } else {
4641         assert(ideal_opc == Op_SaturatingSubV, "");
4642         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4643       }
4644       break;
4645     default:
4646       fatal("Unsupported type %s", type2name(elem_bt));
4647       break;
4648   }
4649 }
4650 
4651 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4652                                                         XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4653   switch (elem_bt) {
4654     case T_BYTE:
4655       if (ideal_opc == Op_SaturatingAddV) {
4656         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4657       } else {
4658         assert(ideal_opc == Op_SaturatingSubV, "");
4659         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4660       }
4661       break;
4662     case T_SHORT:
4663       if (ideal_opc == Op_SaturatingAddV) {
4664         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4665       } else {
4666         assert(ideal_opc == Op_SaturatingSubV, "");
4667         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4668       }
4669       break;
4670     default:
4671       fatal("Unsupported type %s", type2name(elem_bt));
4672       break;
4673   }
4674 }
4675 
4676 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4677                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4678                                     bool is_varshift) {
4679   switch (ideal_opc) {
4680     case Op_AddVB:
4681       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4682     case Op_AddVS:
4683       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4684     case Op_AddVI:
4685       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4686     case Op_AddVL:
4687       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4688     case Op_AddVF:
4689       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4690     case Op_AddVD:
4691       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4692     case Op_SubVB:
4693       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4694     case Op_SubVS:
4695       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4696     case Op_SubVI:
4697       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4698     case Op_SubVL:
4699       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4700     case Op_SubVF:
4701       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4702     case Op_SubVD:
4703       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4704     case Op_MulVS:
4705       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4706     case Op_MulVI:
4707       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4708     case Op_MulVL:
4709       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4710     case Op_MulVF:
4711       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4712     case Op_MulVD:
4713       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4714     case Op_DivVF:
4715       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4716     case Op_DivVD:
4717       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4718     case Op_SqrtVF:
4719       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4720     case Op_SqrtVD:
4721       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4722     case Op_AbsVB:
4723       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4724     case Op_AbsVS:
4725       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4726     case Op_AbsVI:
4727       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4728     case Op_AbsVL:
4729       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4730     case Op_FmaVF:
4731       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4732     case Op_FmaVD:
4733       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4734     case Op_VectorRearrange:
4735       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4736     case Op_LShiftVS:
4737       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4738     case Op_LShiftVI:
4739       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4740     case Op_LShiftVL:
4741       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4742     case Op_RShiftVS:
4743       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4744     case Op_RShiftVI:
4745       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4746     case Op_RShiftVL:
4747       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4748     case Op_URShiftVS:
4749       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4750     case Op_URShiftVI:
4751       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4752     case Op_URShiftVL:
4753       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4754     case Op_RotateLeftV:
4755       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4756     case Op_RotateRightV:
4757       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4758     case Op_MaxV:
4759       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4760     case Op_MinV:
4761       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4762     case Op_UMinV:
4763       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4764     case Op_UMaxV:
4765       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4766     case Op_XorV:
4767       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4768     case Op_OrV:
4769       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4770     case Op_AndV:
4771       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4772     default:
4773       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4774       break;
4775   }
4776 }
4777 
// Dispatch a masked (predicated) vector operation with a memory source operand to the
// matching masked assembler routine, selected by the ideal (C2) opcode.
//   ideal_opc - C2 ideal node opcode selecting the operation
//   eType     - element basic type; forwarded to helpers that pick an instruction width from it
//   mask      - k-register predicate controlling which lanes are written
//   dst/src1  - register operands; src2 is the in-memory operand
//   merge     - true: masked-off lanes keep dst's prior value; false semantics are defined by
//               the called helpers (zero-masking on EVEX forms — confirm in assembler)
//   vlen_enc  - vector length encoding
// Note: this overload supports fewer opcodes than the all-register overload above
// (e.g. no Sqrt/Abs/shift/rotate/rearrange forms with a memory operand).
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, Address src2, bool merge, int vlen_enc) {
  switch (ideal_opc) {
    case Op_AddVB:
      evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVS:
      evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVI:
      evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVL:
      evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVF:
      evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVD:
      evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVB:
      evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVS:
      evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVI:
      evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVL:
      evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVF:
      evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVD:
      evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVS:
      evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVI:
      evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVL:
      evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVF:
      evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVD:
      evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVF:
      evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVD:
      evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVF:
      evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVD:
      evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MaxV:
      evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MinV:
      evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMaxV:
      evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMinV:
      evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_XorV:
      evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_OrV:
      evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AndV:
      evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4842 
4843 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4844                                   KRegister src1, KRegister src2) {
4845   BasicType etype = T_ILLEGAL;
4846   switch(mask_len) {
4847     case 2:
4848     case 4:
4849     case 8:  etype = T_BYTE; break;
4850     case 16: etype = T_SHORT; break;
4851     case 32: etype = T_INT; break;
4852     case 64: etype = T_LONG; break;
4853     default: fatal("Unsupported type"); break;
4854   }
4855   assert(etype != T_ILLEGAL, "");
4856   switch(ideal_opc) {
4857     case Op_AndVMask:
4858       kand(etype, dst, src1, src2); break;
4859     case Op_OrVMask:
4860       kor(etype, dst, src1, src2); break;
4861     case Op_XorVMask:
4862       kxor(etype, dst, src1, src2); break;
4863     default:
4864       fatal("Unsupported masked operation"); break;
4865   }
4866 }
4867 
4868 /*
4869  * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
4870  * If src is NaN, the result is 0.
4871  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4872  * the result is equal to the value of Integer.MIN_VALUE.
4873  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4874  * the result is equal to the value of Integer.MAX_VALUE.
4875  */
// AVX (no masking) fix-up after a vector F2I cast, implementing the Java semantics
// described in the comment above: NaN -> 0, -Inf/underflow -> MIN_VALUE, +Inf/overflow -> MAX_VALUE.
// dst holds the raw cvttps2dq result; src holds the original float lanes.
// xtmp1..xtmp4 are clobbered.
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                   XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
                                                                   Register rscratch, AddressLiteral float_sign_flip,
                                                                   int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // Compare dst lanes with the float_sign_flip pattern; matching lanes came from a
  // special source value. Fast path: skip the fix-up when no lane matches.
  vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
  vptest(xtmp2, xtmp2, vec_enc);
  jccb(Assembler::equal, done);

  // Flip every bit of the pattern in xtmp1 to build the MAX_VALUE replacement.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
  vpxor(xtmp1, xtmp1, xtmp4, vec_enc);

  // NaN lanes (src unordered with itself): blend zero into dst.
  vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
  vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);

  // Recompute the mask for remaining special value.
  vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
  // Extract SRC values corresponding to TRUE mask lanes.
  vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
  // values are set.
  vpxor(xtmp3, xtmp2, xtmp4, vec_enc);

  // Blend the MAX_VALUE pattern into dst for the +ve special lanes.
  vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
  bind(done);
}
4905 
// EVEX (opmask-based) fix-up after a vector F2I cast; same semantics as the AVX
// variant above but uses k-register masked blends instead of vblendvps.
// xtmp1, xtmp2, ktmp1, ktmp2 are clobbered.
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                    XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                    Register rscratch, AddressLiteral float_sign_flip,
                                                                    int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // ktmp1: lanes of dst equal to the float_sign_flip pattern, i.e. special results.
  // Fast path when none are present.
  evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // NaN lanes (src unordered with itself) -> merge zero into dst under ktmp2.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // ktmp1: remaining special lanes, further restricted to non-negative (>= 0) sources.
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  // Build the all-ones-based MAX_VALUE pattern (func 0x11 = NOT of the second operand).
  vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
4927 
// EVEX fix-up after a vector F2L cast: NaN -> 0, -Inf/underflow -> Long.MIN_VALUE,
// +Inf/overflow -> Long.MAX_VALUE. dst holds long lanes (hence the quadword compares
// and moves); src still holds the original float lanes, compared with evcmpps.
// xtmp1, xtmp2, ktmp1, ktmp2 are clobbered.
void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral double_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // ktmp1: dst lanes equal to the sign-flip pattern (special results); fast path if none.
  evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // NaN source lanes -> zero in dst.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes with non-negative sources -> Long.MAX_VALUE pattern
  // (vpternlogq func 0x11 computes the bitwise NOT of the sign-flip pattern).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
4950 
// EVEX fix-up after a vector D2I cast: NaN -> 0, -Inf/underflow -> Integer.MIN_VALUE,
// +Inf/overflow -> Integer.MAX_VALUE. dst holds int lanes (dword compares/moves);
// src holds the original double lanes, compared with evcmppd.
// xtmp1, xtmp2, ktmp1, ktmp2 are clobbered.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral float_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // ktmp1: dst lanes equal to the sign-flip pattern (special results); fast path if none.
  evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // NaN source lanes -> zero in dst.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes with non-negative sources -> MAX_VALUE pattern
  // (vpternlogq func 0x11 = bitwise NOT of the sign-flip pattern).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
4972 
4973 /*
4974  * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
4975  * If src is NaN, the result is 0.
4976  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4977  * the result is equal to the value of Long.MIN_VALUE.
4978  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4979  * the result is equal to the value of Long.MAX_VALUE.
4980  */
// EVEX fix-up after a vector D2L cast, implementing the semantics in the comment
// above: NaN -> 0, -Inf/underflow -> Long.MIN_VALUE, +Inf/overflow -> Long.MAX_VALUE.
// xtmp1, xtmp2, ktmp1, ktmp2 are clobbered.
void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                      Register rscratch, AddressLiteral double_sign_flip,
                                                                      int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // ktmp1: dst lanes equal to the sign-flip pattern (special results); fast path if none.
  evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // NaN source lanes -> zero in dst.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes with non-negative sources -> Long.MAX_VALUE pattern
  // (vpternlogq func 0x11 = bitwise NOT of the sign-flip pattern).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5003 
5004 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5005                                                              XMMRegister xtmp, int index, int vec_enc) {
5006    assert(vec_enc < Assembler::AVX_512bit, "");
5007    if (vec_enc == Assembler::AVX_256bit) {
5008      vextractf128_high(xtmp, src);
5009      vshufps(dst, src, xtmp, index, vec_enc);
5010    } else {
5011      vshufps(dst, src, zero, index, vec_enc);
5012    }
5013 }
5014 
// AVX fix-up after a vector D2I cast. The double source uses src_vec_enc while the
// packed int destination is always handled at 128-bit width. Compare masks computed
// on the quadword source lanes are packed down to doubleword lanes before blending.
// xtmp1..xtmp5 are clobbered.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                                    XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
                                                                    AddressLiteral float_sign_flip, int src_vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");

  Label done;
  // Compare the destination lanes with float_sign_flip
  // value to get mask for all special values; fast path when none match.
  movdqu(xtmp1, float_sign_flip, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
  ptest(xtmp2, xtmp2);
  jccb(Assembler::equal, done);

  // Flip float_sign_flip to get max integer value.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
  pxor(xtmp1, xtmp4);

  // Set destination lanes corresponding to unordered (NaN) source lanes as zero.
  vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
  vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);

  // Shuffle mask vector and pack the lower doubleword from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);

  // Recompute the mask for remaining special value.
  pxor(xtmp2, xtmp3);
  // Extract mask corresponding to non-negative source lanes.
  vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);

  // Shuffle mask vector and pack the lower doubleword from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  pand(xtmp3, xtmp2);

  // Replace destination lanes holding special value(0x80000000) with max int
  // if corresponding source lane holds a +ve value.
  vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
  bind(done);
}
5054 
5055 
5056 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5057                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5058   switch(to_elem_bt) {
5059     case T_SHORT:
5060       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5061       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5062       vpackusdw(dst, dst, zero, vec_enc);
5063       if (vec_enc == Assembler::AVX_256bit) {
5064         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5065       }
5066       break;
5067     case  T_BYTE:
5068       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5069       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5070       vpackusdw(dst, dst, zero, vec_enc);
5071       if (vec_enc == Assembler::AVX_256bit) {
5072         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5073       }
5074       vpackuswb(dst, dst, zero, vec_enc);
5075       break;
5076     default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt));
5077   }
5078 }
5079 
5080 /*
5081  * Algorithm for vector D2L and F2I conversions (AVX 10.2 unsupported):-
5082  * a) Perform vector D2L/F2I cast.
5083  * b) Choose fast path if none of the result vector lane contains 0x80000000 value.
5084  *    It signifies that source value could be any of the special floating point
5085  *    values(NaN,-Inf,Inf,Max,-Min).
5086  * c) Set destination to zero if source is NaN value.
5087  * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5088  */
5089 
5090 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5091                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5092                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5093   int to_elem_sz = type2aelembytes(to_elem_bt);
5094   assert(to_elem_sz <= 4, "");
5095   vcvttps2dq(dst, src, vec_enc);
5096   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5097   if (to_elem_sz < 4) {
5098     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5099     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5100   }
5101 }
5102 
5103 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5104                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5105                                             Register rscratch, int vec_enc) {
5106   int to_elem_sz = type2aelembytes(to_elem_bt);
5107   assert(to_elem_sz <= 4, "");
5108   vcvttps2dq(dst, src, vec_enc);
5109   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5110   switch(to_elem_bt) {
5111     case T_INT:
5112       break;
5113     case T_SHORT:
5114       evpmovdw(dst, dst, vec_enc);
5115       break;
5116     case T_BYTE:
5117       evpmovdb(dst, dst, vec_enc);
5118       break;
5119     default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt));
5120   }
5121 }
5122 
// Vector cast float -> long on EVEX: truncating convert, then fix up lanes whose
// source was NaN/Inf/out of long range per Java cast semantics.
void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                            KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
                                            Register rscratch, int vec_enc) {
  evcvttps2qq(dst, src, vec_enc);
  vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
}
5129 
5130 // Handling for downcasting from double to integer or sub-word types on AVX2.
5131 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5132                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5133                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5134   int to_elem_sz = type2aelembytes(to_elem_bt);
5135   assert(to_elem_sz < 8, "");
5136   vcvttpd2dq(dst, src, vec_enc);
5137   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5138                                               float_sign_flip, vec_enc);
5139   if (to_elem_sz < 4) {
5140     // xtmp4 holds all zero lanes.
5141     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5142   }
5143 }
5144 
5145 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5146                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5147                                             KRegister ktmp2, AddressLiteral sign_flip,
5148                                             Register rscratch, int vec_enc) {
5149   if (VM_Version::supports_avx512dq()) {
5150     evcvttpd2qq(dst, src, vec_enc);
5151     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5152     switch(to_elem_bt) {
5153       case T_LONG:
5154         break;
5155       case T_INT:
5156         evpmovsqd(dst, dst, vec_enc);
5157         break;
5158       case T_SHORT:
5159         evpmovsqd(dst, dst, vec_enc);
5160         evpmovdw(dst, dst, vec_enc);
5161         break;
5162       case T_BYTE:
5163         evpmovsqd(dst, dst, vec_enc);
5164         evpmovdb(dst, dst, vec_enc);
5165         break;
5166       default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt));
5167     }
5168   } else {
5169     assert(type2aelembytes(to_elem_bt) <= 4, "");
5170     vcvttpd2dq(dst, src, vec_enc);
5171     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5172     switch(to_elem_bt) {
5173       case T_INT:
5174         break;
5175       case T_SHORT:
5176         evpmovdw(dst, dst, vec_enc);
5177         break;
5178       case T_BYTE:
5179         evpmovdb(dst, dst, vec_enc);
5180         break;
5181       default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt));
5182     }
5183   }
5184 }
5185 
5186 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5187   switch(to_elem_bt) {
5188     case T_LONG:
5189       evcvttps2qqs(dst, src, vec_enc);
5190       break;
5191     case T_INT:
5192       evcvttps2dqs(dst, src, vec_enc);
5193       break;
5194     case T_SHORT:
5195       evcvttps2dqs(dst, src, vec_enc);
5196       evpmovdw(dst, dst, vec_enc);
5197       break;
5198     case T_BYTE:
5199       evcvttps2dqs(dst, src, vec_enc);
5200       evpmovdb(dst, dst, vec_enc);
5201       break;
5202     default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt));
5203   }
5204 }
5205 
5206 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5207   switch(to_elem_bt) {
5208     case T_LONG:
5209       evcvttps2qqs(dst, src, vec_enc);
5210       break;
5211     case T_INT:
5212       evcvttps2dqs(dst, src, vec_enc);
5213       break;
5214     case T_SHORT:
5215       evcvttps2dqs(dst, src, vec_enc);
5216       evpmovdw(dst, dst, vec_enc);
5217       break;
5218     case T_BYTE:
5219       evcvttps2dqs(dst, src, vec_enc);
5220       evpmovdb(dst, dst, vec_enc);
5221       break;
5222     default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt));
5223   }
5224 }
5225 
5226 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5227   switch(to_elem_bt) {
5228     case T_LONG:
5229       evcvttpd2qqs(dst, src, vec_enc);
5230       break;
5231     case T_INT:
5232       evcvttpd2dqs(dst, src, vec_enc);
5233       break;
5234     case T_SHORT:
5235       evcvttpd2dqs(dst, src, vec_enc);
5236       evpmovdw(dst, dst, vec_enc);
5237       break;
5238     case T_BYTE:
5239       evcvttpd2dqs(dst, src, vec_enc);
5240       evpmovdb(dst, dst, vec_enc);
5241       break;
5242     default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (reg src): %s", type2name(to_elem_bt));
5243   }
5244 }
5245 
5246 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5247   switch(to_elem_bt) {
5248     case T_LONG:
5249       evcvttpd2qqs(dst, src, vec_enc);
5250       break;
5251     case T_INT:
5252       evcvttpd2dqs(dst, src, vec_enc);
5253       break;
5254     case T_SHORT:
5255       evcvttpd2dqs(dst, src, vec_enc);
5256       evpmovdw(dst, dst, vec_enc);
5257       break;
5258     case T_BYTE:
5259       evcvttpd2dqs(dst, src, vec_enc);
5260       evpmovdb(dst, dst, vec_enc);
5261       break;
5262     default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt));
5263   }
5264 }
5265 
5266 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5267                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5268                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5269   // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
5270   // and re-instantiate original MXCSR.RC mode after that.
5271   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5272 
5273   mov64(tmp, julong_cast(0.5L));
5274   evpbroadcastq(xtmp1, tmp, vec_enc);
5275   vaddpd(xtmp1, src , xtmp1, vec_enc);
5276   evcvtpd2qq(dst, xtmp1, vec_enc);
5277   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5278                                                 double_sign_flip, vec_enc);;
5279 
5280   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5281 }
5282 
// Vector round-to-nearest for floats (result is int lanes) on EVEX: computes
// floor(src + 0.5) under MXCSR.RC loaded from new_mxcsr, fixes up special lanes,
// then restores the standard MXCSR state. tmp and all xtmp/ktmp registers are clobbered.
void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Broadcast 0.5f into xtmp1 and add it to every source lane before converting.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  vcvtps2dq(dst, xtmp1, vec_enc);
  // Fix up lanes whose source was NaN/Inf/out of int range.
  vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                              float_sign_flip, vec_enc);

  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5300 
// Vector round-to-nearest for floats (result is int lanes) on AVX: same algorithm as
// the EVEX variant above but uses the blend-based special-case fix-up.
// tmp and all xtmp registers are clobbered.
void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
                                               AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                               Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Broadcast 0.5f into xtmp1 and add it to every source lane before converting.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  vcvtps2dq(dst, xtmp1, vec_enc);
  // Fix up lanes whose source was NaN/Inf/out of int range.
  vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);

  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5317 
5318 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5319                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5320   switch (from_elem_bt) {
5321     case T_BYTE:
5322       switch (to_elem_bt) {
5323         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5324         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5325         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5326         default: ShouldNotReachHere();
5327       }
5328       break;
5329     case T_SHORT:
5330       switch (to_elem_bt) {
5331         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5332         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5333         default: ShouldNotReachHere();
5334       }
5335       break;
5336     case T_INT:
5337       assert(to_elem_bt == T_LONG, "");
5338       vpmovzxdq(dst, src, vlen_enc);
5339       break;
5340     default:
5341       ShouldNotReachHere();
5342   }
5343 }
5344 
5345 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5346                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5347   switch (from_elem_bt) {
5348     case T_BYTE:
5349       switch (to_elem_bt) {
5350         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5351         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5352         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5353         default: ShouldNotReachHere();
5354       }
5355       break;
5356     case T_SHORT:
5357       switch (to_elem_bt) {
5358         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5359         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5360         default: ShouldNotReachHere();
5361       }
5362       break;
5363     case T_INT:
5364       assert(to_elem_bt == T_LONG, "");
5365       vpmovsxdq(dst, src, vlen_enc);
5366       break;
5367     default:
5368       ShouldNotReachHere();
5369   }
5370 }
5371 
// Casts a boolean-vector mask representation between element sizes (AVX/AVX2, not
// AVX-512 opmasks). Mask lanes are all-ones/all-zeros, so widening can use sign
// extension and narrowing can use signed saturating packs; 256-bit packs operate
// per 128-bit lane, so a vpermq compaction step is interleaved where needed.
void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
                                         BasicType dst_bt, BasicType src_bt, int vlen) {
  // Vector length encoding is chosen from the larger of the two element sizes.
  int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
  assert(vlen_enc != AVX_512bit, "");

  int dst_bt_size = type2aelembytes(dst_bt);
  int src_bt_size = type2aelembytes(src_bt);
  if (dst_bt_size > src_bt_size) {
    // Widening: sign-extend byte mask lanes by the size ratio.
    switch (dst_bt_size / src_bt_size) {
      case 2: vpmovsxbw(dst, src, vlen_enc); break;
      case 4: vpmovsxbd(dst, src, vlen_enc); break;
      case 8: vpmovsxbq(dst, src, vlen_enc); break;
      default: ShouldNotReachHere();
    }
  } else {
    assert(dst_bt_size < src_bt_size, "");
    // Narrowing: pack down by the size ratio (2x, 4x or 8x).
    switch (src_bt_size / dst_bt_size) {
      case 2: {
        if (vlen_enc == AVX_128bit) {
          vpacksswb(dst, src, src, vlen_enc);
        } else {
          // 256-bit pack interleaves per lane; compact the valid quadwords.
          vpacksswb(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
        }
        break;
      }
      case 4: {
        if (vlen_enc == AVX_128bit) {
          vpackssdw(dst, src, src, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpackssdw(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      case 8: {
        if (vlen_enc == AVX_128bit) {
          // First halve via dword shuffle, then pack twice more.
          vpshufd(dst, src, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpshufd(dst, src, 0x08, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, AVX_128bit);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      default: ShouldNotReachHere();
    }
  }
}
5426 
5427 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5428                                    bool merge, BasicType bt, int vlen_enc) {
5429   if (bt == T_INT) {
5430     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5431   } else {
5432     assert(bt == T_LONG, "");
5433     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5434   }
5435 }
5436 
5437 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5438                                    bool merge, BasicType bt, int vlen_enc) {
5439   if (bt == T_INT) {
5440     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5441   } else {
5442     assert(bt == T_LONG, "");
5443     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5444   }
5445 }
5446 
// Expand a per-lane bitmask held in scalar 'src' into a byte-per-lane vector
// mask in 'dst': bit i of src becomes byte i of dst with value 0x00 or 0x01.
// PDEP deposits 8 mask bits at a time into the LSB of each byte of a quadword
// (using 0x0101010101010101 as the deposit template), which is then inserted
// into the destination vector 128 bits at a time. Requires BMI2 (pdepq).
// mask_len must be a multiple of 8 once it exceeds 8.
void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
                                               Register rtmp2, XMMRegister xtmp, int mask_len,
                                               int vec_enc) {
  int index = 0;
  int vindex = 0;
  mov64(rtmp1, 0x0101010101010101L);
  // Deposit the low 8 mask bits of src into the 8 byte positions of rtmp1.
  pdepq(rtmp1, src, rtmp1);
  if (mask_len > 8) {
    // Keep a working copy of the remaining mask bits and prime the staging
    // vector register with the first quadword.
    movq(rtmp2, src);
    vpxor(xtmp, xtmp, xtmp, vec_enc);
    movq(xtmp, rtmp1);
  }
  movq(dst, rtmp1);

  mask_len -= 8;
  while (mask_len > 0) {
    assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
    index++;
    if ((index % 2) == 0) {
      // Starting a fresh 16-byte (two quadword) group: clear the staging register.
      pxor(xtmp, xtmp);
    }
    mov64(rtmp1, 0x0101010101010101L);
    // Consume the next 8 mask bits and expand them into the next quadword.
    shrq(rtmp2, 8);
    pdepq(rtmp1, rtmp2, rtmp1);
    pinsrq(xtmp, rtmp1, index % 2);
    vindex = index / 2;
    if (vindex) {
      // Write entire 16 byte vector when both 64 bit
      // lanes are update to save redundant instructions.
      if (index % 2) {
        vinsertf128(dst, dst, xtmp, vindex);
      }
    } else {
      vmovdqu(dst, xtmp);
    }
    mask_len -= 8;
  }
}
5485 
// Reduce a scalar mask value (one bit per lane, held in 'tmp') according to
// the vector-mask query opcode, leaving the answer in 'dst':
//   - VectorMaskTrueCount: population count of the mask bits.
//   - VectorMaskLastTrue:  index of the highest set bit, or -1 if none.
//   - VectorMaskFirstTrue: index of the lowest set bit, or masklen if none.
//   - VectorMaskToLong:    the mask bits themselves are the result (dst == tmp).
// Note: 'tmp' is clobbered by several of the paths below.
void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
  switch(opc) {
    case Op_VectorMaskTrueCount:
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      if (VM_Version::supports_lzcnt()) {
        // last-true = 63 - lzcnt(mask); lzcnt(0) == 64 yields -1 for an empty mask.
        lzcntq(tmp, tmp);
        movl(dst, 63);
        subl(dst, tmp);
      } else {
        // Preload -1; bsrq sets ZF for a zero mask, so the cmov only
        // overwrites dst when at least one bit is set.
        movl(dst, -1);
        bsrq(tmp, tmp);
        cmov32(Assembler::notZero, dst, tmp);
      }
      break;
    case Op_VectorMaskFirstTrue:
      if (VM_Version::supports_bmi1()) {
        if (masklen < 32) {
          // Plant a sentinel bit at position masklen so an empty mask
          // yields masklen.
          orl(tmp, 1 << masklen);
          tzcntl(dst, tmp);
        } else if (masklen == 32) {
          // tzcntl of zero is 32 == masklen; no sentinel needed.
          tzcntl(dst, tmp);
        } else {
          assert(masklen == 64, "");
          // tzcntq of zero is 64 == masklen.
          tzcntq(dst, tmp);
        }
      } else {
        if (masklen < 32) {
          orl(tmp, 1 << masklen);
          bsfl(dst, tmp);
        } else {
          assert(masklen == 32 || masklen == 64, "");
          // bsf sets ZF on zero input; preload dst with masklen and only
          // overwrite it when a bit was found.
          movl(dst, masklen);
          if (masklen == 32)  {
            bsfl(tmp, tmp);
          } else {
            bsfq(tmp, tmp);
          }
          cmov32(Assembler::notZero, dst, tmp);
        }
      }
      break;
    case Op_VectorMaskToLong:
      assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
      break;
    default: assert(false, "Unhandled mask operation");
  }
}
5535 
// Vector mask reduction for masks held in an AVX-512 opmask register: move the
// mask bits into scalar 'tmp', clip stray bits of partial masks, and delegate
// to vector_mask_operation_helper.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
                                              int masklen, int masksize, int vec_enc) {
  assert(VM_Version::supports_popcnt(), "");

  if(VM_Version::supports_avx512bw()) {
    kmovql(tmp, mask);
  } else {
    // Without avx512bw only 16-bit opmask moves are available.
    assert(masklen <= 16, "");
    kmovwl(tmp, mask);
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  // FirstTrue is exempt: the helper plants its own sentinel bit at masklen.
  if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5555 
// Vector mask reduction for masks held in an XMM/YMM register (no opmask).
// The vector mask is condensed to one bit per lane in scalar 'tmp' via a
// movmsk-style instruction (packing first where the lane width requires it),
// stray bits of partial masks are clipped, then the scalar reduction is done
// by vector_mask_operation_helper.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              Register tmp, int masklen, BasicType bt, int vec_enc) {
  assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
         (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
  assert(VM_Version::supports_popcnt(), "");

  bool need_clip = false;
  switch(bt) {
    case T_BOOLEAN:
      // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1
      // Negate (0 - x) so a 1 becomes -1 and vpmovmskb can pick up the sign bit.
      vpxor(xtmp, xtmp, xtmp, vec_enc);
      vpsubb(xtmp, xtmp, mask, vec_enc);
      vpmovmskb(tmp, xtmp, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_BYTE:
      vpmovmskb(tmp, mask, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_SHORT:
      // Narrow the 16-bit mask lanes to bytes, then extract the byte sign bits.
      vpacksswb(xtmp, mask, mask, vec_enc);
      if (masklen >= 16) {
        // Compact the two meaningful quadwords of the 256-bit pack result.
        vpermpd(xtmp, xtmp, 8, vec_enc);
      }
      vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
      need_clip = masklen < 16;
      break;
    case T_INT:
    case T_FLOAT:
      vmovmskps(tmp, mask, vec_enc);
      need_clip = masklen < 4;
      break;
    case T_LONG:
    case T_DOUBLE:
      vmovmskpd(tmp, mask, vec_enc);
      need_clip = masklen < 2;
      break;
    default: assert(false, "Unhandled type, %s", type2name(bt));
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  if (need_clip && opc != Op_VectorMaskFirstTrue) {
    // need_clip implies masklen < 32
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5605 
// Compute a "compressed" opmask in dst: a mask whose low N bits are set, where
// N is the number of set bits in the low mask_len bits of src. PEXT with an
// all-ones source gathers one set result bit per set bit of the clipped input
// mask. Requires BMI2 (pextq). Clobbers rtmp1 and rtmp2.
void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
                                             Register rtmp2, int mask_len) {
  kmov(rtmp1, src);
  // Clip the mask to its mask_len live bits.
  andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
  mov64(rtmp2, -1L);
  pextq(rtmp2, rtmp2, rtmp1);
  kmov(dst, rtmp2);
}
5614 
// AVX2 fallback for CompressV/ExpandV on 32- and 64-bit lanes (no
// vpcompress/vpexpand available). The lane movement is done with a cross-lane
// permute whose control row is fetched from a stub-generated table, 32 bytes
// per row, indexed by the scalar mask value extracted with vmovmsk.
void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
                                                    XMMRegister mask, Register rtmp, Register rscratch,
                                                    XMMRegister permv, XMMRegister xtmp, BasicType bt,
                                                    int vec_enc) {
  assert(type2aelembytes(bt) >= 4, "");
  assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
  address compress_perm_table = nullptr;
  address expand_perm_table = nullptr;
  // Pick the table for the lane width and condense the vector mask to a
  // scalar index in rtmp.
  if (type2aelembytes(bt) == 8) {
    compress_perm_table = StubRoutines::x86::compress_perm_table64();
    expand_perm_table  = StubRoutines::x86::expand_perm_table64();
    vmovmskpd(rtmp, mask, vec_enc);
  } else {
    compress_perm_table = StubRoutines::x86::compress_perm_table32();
    expand_perm_table = StubRoutines::x86::expand_perm_table32();
    vmovmskps(rtmp, mask, vec_enc);
  }
  shlq(rtmp, 5); // for 32 byte permute row.
  if (opcode == Op_CompressV) {
    lea(rscratch, ExternalAddress(compress_perm_table));
  } else {
    lea(rscratch, ExternalAddress(expand_perm_table));
  }
  addptr(rtmp, rscratch);
  vmovdqu(permv, Address(rtmp));
  vpermps(dst, permv, src, Assembler::AVX_256bit);
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with zero vector using permute mask, each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, this can potentially be used as a blending mask after
  // compressing/expanding the source vector lanes.
  vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
}
5648 
5649 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5650                                                bool merge, BasicType bt, int vec_enc) {
5651   if (opcode == Op_CompressV) {
5652     switch(bt) {
5653     case T_BYTE:
5654       evpcompressb(dst, mask, src, merge, vec_enc);
5655       break;
5656     case T_CHAR:
5657     case T_SHORT:
5658       evpcompressw(dst, mask, src, merge, vec_enc);
5659       break;
5660     case T_INT:
5661       evpcompressd(dst, mask, src, merge, vec_enc);
5662       break;
5663     case T_FLOAT:
5664       evcompressps(dst, mask, src, merge, vec_enc);
5665       break;
5666     case T_LONG:
5667       evpcompressq(dst, mask, src, merge, vec_enc);
5668       break;
5669     case T_DOUBLE:
5670       evcompresspd(dst, mask, src, merge, vec_enc);
5671       break;
5672     default:
5673       fatal("Unsupported type %s", type2name(bt));
5674       break;
5675     }
5676   } else {
5677     assert(opcode == Op_ExpandV, "");
5678     switch(bt) {
5679     case T_BYTE:
5680       evpexpandb(dst, mask, src, merge, vec_enc);
5681       break;
5682     case T_CHAR:
5683     case T_SHORT:
5684       evpexpandw(dst, mask, src, merge, vec_enc);
5685       break;
5686     case T_INT:
5687       evpexpandd(dst, mask, src, merge, vec_enc);
5688       break;
5689     case T_FLOAT:
5690       evexpandps(dst, mask, src, merge, vec_enc);
5691       break;
5692     case T_LONG:
5693       evpexpandq(dst, mask, src, merge, vec_enc);
5694       break;
5695     case T_DOUBLE:
5696       evexpandpd(dst, mask, src, merge, vec_enc);
5697       break;
5698     default:
5699       fatal("Unsupported type %s", type2name(bt));
5700       break;
5701     }
5702   }
5703 }
5704 
// Vector signum for double/float using EVEX masked blends:
// dst = -1.0 for negative lanes, +1.0 for positive lanes, and the input value
// itself for 0.0, -0.0 and NaN (EQ_UQ also matches unordered, i.e. NaN).
void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                           KRegister ktmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    // dst = 0.0 - 1.0 = -1.0 in every lane.
    vsubpd(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
  } else {
    assert(opcode == Op_SignumVF, "");
    // dst = -1.0 in every lane.
    vsubps(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmps(dst, ktmp1, dst, src, true, vec_enc);
  }
}
5726 
// Vector signum for double/float on AVX (no opmask registers): the first
// blend is controlled directly by the sign bit of src, the second uses an
// explicit unordered-equal compare so 0.0, -0.0 and NaN return the input.
void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                          XMMRegister xtmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    // dst = 0.0 - 1.0 = -1.0 in every lane.
    vsubpd(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  } else {
    assert(opcode == Op_SignumVF, "");
    vsubps(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  }
}
5746 
// Build an opmask of mask_len bits from scalar 'src': the scalar is moved in
// at 32- or 64-bit width, and for partial masks shifted right so only the low
// mask_len mask bits remain.
// NOTE(review): assumes the relevant bits of src already hold the replicated
// mask pattern — confirm against MaskAll callers.
void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
  if (VM_Version::supports_avx512bw()) {
    if (mask_len > 32) {
      kmovql(dst, src);
    } else {
      kmovdl(dst, src);
      if (mask_len != 32) {
        kshiftrdl(dst, dst, 32 - mask_len);
      }
    }
  } else {
    // Without avx512bw only 16-bit opmask registers are usable.
    assert(mask_len <= 16, "");
    kmovwl(dst, src);
    if (mask_len != 16) {
      kshiftrwl(dst, dst, 16 - mask_len);
    }
  }
}
5765 
5766 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5767   int lane_size = type2aelembytes(bt);
5768   if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5769       (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5770     movptr(rtmp, imm32);
5771     switch(lane_size) {
5772       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5773       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5774       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5775       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5776       fatal("Unsupported lane size %d", lane_size);
5777       break;
5778     }
5779   } else {
5780     movptr(rtmp, imm32);
5781     movq(dst, rtmp);
5782     switch(lane_size) {
5783       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5784       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5785       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5786       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5787       fatal("Unsupported lane size %d", lane_size);
5788       break;
5789     }
5790   }
5791 }
5792 
5793 //
5794 // Following is lookup table based popcount computation algorithm:-
5795 //       Index   Bit set count
5796 //     [ 0000 ->   0,
5797 //       0001 ->   1,
5798 //       0010 ->   1,
5799 //       0011 ->   2,
5800 //       0100 ->   1,
5801 //       0101 ->   2,
5802 //       0110 ->   2,
5803 //       0111 ->   3,
5804 //       1000 ->   1,
5805 //       1001 ->   2,
5806 //       1010 ->   3,
5807 //       1011 ->   3,
5808 //       1100 ->   2,
5809 //       1101 ->   3,
5810 //       1111 ->   4 ]
5811 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5812 //     shuffle indices for lookup table access.
5813 //  b. Right shift each byte of vector lane by 4 positions.
5814 //  c. Count the number of 1s in 4 MSB bits each byte. These bits are used as
5815 //     shuffle indices for lookup table access.
5816 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5817 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5818 //     count of all the bytes of a quadword.
5819 //  f. Perform step e. for upper 128bit vector lane.
5820 //  g. Pack the bitset count of quadwords back to double word.
5821 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
5822 
// Steps a-d of the algorithm above: per-byte population count via a nibble
// lookup table accessed with vpshufb. Clobbers xtmp1, xtmp2 and rtmp.
void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
  vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);   // nibble mask
  vpsrlw(dst, src, 4, vec_enc);                          // move high nibbles down
  vpand(dst, dst, xtmp1, vec_enc);                       // drop bits shifted in from the neighbor byte
  vpand(xtmp1, src, xtmp1, vec_enc);                     // isolate low nibbles
  vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
  vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);                 // LUT lookup: popcount of low nibble
  vpshufb(dst, xtmp2, dst, vec_enc);                     // LUT lookup: popcount of high nibble
  vpaddb(dst, dst, xtmp1, vec_enc);                      // per-byte bit count
}
5835 
// Per-int population count, built from the per-byte counts (steps e-h of the
// algorithm above): unpack dwords against zero, sum bytes per quadword with
// vpsadbw, then pack the sums back into dword lanes.
void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Following code is as per steps e,f,g and h of above algorithm.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
  vpsadbw(dst, dst, xtmp2, vec_enc);
  vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
  vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
  vpackuswb(dst, xtmp1, dst, vec_enc);
}
5847 
// Per-short population count, built from the per-byte counts by folding each
// word's high-byte count into its low-byte count.
void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Add the popcount of upper and lower bytes of word.
  vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
  vpsrlw(dst, xtmp1, 8, vec_enc);       // high-byte counts moved to the low byte
  vpand(xtmp1, xtmp1, xtmp2, vec_enc);  // low-byte counts
  vpaddw(dst, dst, xtmp1, vec_enc);
}
5857 
// Per-long population count: vpsadbw against zero sums the eight per-byte
// counts of each quadword directly.
void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpsadbw(dst, xtmp1, xtmp2, vec_enc);
}
5864 
5865 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5866                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5867   switch(bt) {
5868     case T_LONG:
5869       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5870       break;
5871     case T_INT:
5872       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5873       break;
5874     case T_CHAR:
5875     case T_SHORT:
5876       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5877       break;
5878     case T_BYTE:
5879     case T_BOOLEAN:
5880       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5881       break;
5882     default:
5883       fatal("Unsupported type %s", type2name(bt));
5884       break;
5885   }
5886 }
5887 
5888 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5889                                                       KRegister mask, bool merge, int vec_enc) {
5890   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5891   switch(bt) {
5892     case T_LONG:
5893       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5894       evpopcntq(dst, mask, src, merge, vec_enc);
5895       break;
5896     case T_INT:
5897       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5898       evpopcntd(dst, mask, src, merge, vec_enc);
5899       break;
5900     case T_CHAR:
5901     case T_SHORT:
5902       assert(VM_Version::supports_avx512_bitalg(), "");
5903       evpopcntw(dst, mask, src, merge, vec_enc);
5904       break;
5905     case T_BYTE:
5906     case T_BOOLEAN:
5907       assert(VM_Version::supports_avx512_bitalg(), "");
5908       evpopcntb(dst, mask, src, merge, vec_enc);
5909       break;
5910     default:
5911       fatal("Unsupported type %s", type2name(bt));
5912       break;
5913   }
5914 }
5915 
5916 // Bit reversal algorithm first reverses the bits of each byte followed by
5917 // a byte level reversal for multi-byte primitive types (short/int/long).
5918 // Algorithm performs a lookup table access to get reverse bit sequence
5919 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte
5920 // is obtained by swapping the reverse bit sequences of upper and lower
5921 // nibble of a byte.
void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, Register rtmp, int vec_enc) {
  if (VM_Version::supports_avx512vlbw()) {
    // LUT based reversal: a vpshufb lookup reverses the bits of each nibble.

    // Get the reverse bit sequence of lower nibble of each byte.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
    evpandq(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    evporq(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);

  } else if(vec_enc == Assembler::AVX_512bit) {
    // No avx512bw, so no 512-bit vpshufb: use shift based bit reversal,
    // swapping progressively smaller bit groups.
    assert(bt == T_LONG || bt == T_INT, "");

    // Swap lower and upper nibble of each byte.
    vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);

    // Swap two least and most significant bits of each nibble.
    vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);

    // Swap adjacent pair of bits.
    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);

    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
  } else {
    // Pre-AVX512 path: same LUT scheme as above with VEX-encoded logic ops.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);

    // Get the reverse bit sequence of lower nibble of each byte.
    vpand(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    vpor(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);
  }
}
5979 
// Bit reversal using a single Galois-field affine transform: the broadcast
// 'mask' operand encodes the 8x8 bit-reversal matrix, so one vgf2p8affineqb
// reverses the bits within every byte; only the per-element byte-order
// reversal remains.
void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
                                                XMMRegister xtmp, Register rscratch) {
  assert(VM_Version::supports_gfni(), "");
  assert(rscratch != noreg || always_reachable(mask), "missing");

  // Galois field instruction based bit reversal based on following algorithm.
  // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
  vpbroadcastq(xtmp, mask, vec_enc, rscratch);
  vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
  vector_reverse_byte(bt, dst, xtmp, vec_enc);
}
5991 
// dst = ((src & bitmask) << nbits) | ((src & ~bitmask) >> nbits)
// i.e. exchange adjacent nbits-wide bit groups selected by 'bitmask'
// (broadcast to every 32-bit lane). Uses EVEX logic ops (evpandq/evporq).
void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
                                          XMMRegister xtmp1, Register rtmp, int vec_enc) {
  vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
  evpandq(dst, xtmp1, src, vec_enc);    // masked groups, shifted up
  vpsllq(dst, dst, nbits, vec_enc);
  vpandn(xtmp1, xtmp1, src, vec_enc);   // complementary groups, shifted down
  vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
  evporq(dst, dst, xtmp1, vec_enc);
}
6001 
// Reverse the byte order of each element of type 'bt' using rotates and an
// 8-bit group swap instead of a shuffle table (EVEX-only path).
void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Shift based bit reversal.
  assert(VM_Version::supports_evex(), "");
  switch(bt) {
    case T_LONG:
      // Swap upper and lower double word of each quad word.
      evprorq(xtmp1, k0, src, 32, true, vec_enc);
      evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_INT:
      // Swap upper and lower word of each double word.
      evprord(xtmp1, k0, src, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      // Swap upper and lower byte of each word.
      vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
      break;
    case T_BYTE:
      // Byte reversal of single-byte elements is the identity; just copy.
      evmovdquq(dst, k0, src, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6031 
// Reverse the byte order of each element of type 'bt' with vpshufb, using a
// precomputed per-type shuffle mask from the stub routines.
void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
  if (bt == T_BYTE) {
    // Byte reversal of single-byte elements is the identity; just copy src.
    if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
      evmovdquq(dst, k0, src, true, vec_enc);
    } else {
      vmovdqu(dst, src);
    }
    return;
  }
  // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
  // pre-computed shuffle indices.
  switch(bt) {
    case T_LONG:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
      break;
    case T_INT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
      break;
    case T_CHAR:
    case T_SHORT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
  vpshufb(dst, src, dst, vec_enc);
}
6060 
// Count leading zeros per lane using AVX-512CD vplzcnt where the lane width
// supports it directly; 16-bit lanes are synthesized from the 32-bit
// instruction and 8-bit lanes from a nibble lookup table.
void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                        KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
  assert(is_integral_type(bt), "");
  assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
  assert(VM_Version::supports_avx512cd(), "");
  switch(bt) {
    case T_LONG:
      evplzcntq(dst, ktmp, src, merge, vec_enc);
      break;
    case T_INT:
      evplzcntd(dst, ktmp, src, merge, vec_enc);
      break;
    case T_SHORT:
      // Widen each short to a dword whose low half is all-ones (bounding the
      // count at 16), lzcnt the dwords, then pack the counts back to shorts.
      vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);  // xtmp1 = all-ones
      vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
      evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
      vpunpckhwd(dst, xtmp1, src, vec_enc);
      evplzcntd(dst, ktmp, dst, merge, vec_enc);
      vpackusdw(dst, xtmp2, dst, vec_enc);
      break;
    case T_BYTE:
      // T1 = Compute leading zero counts of 4 LSB bits of each byte by
      // accessing the lookup table.
      // T2 = Compute leading zero counts of 4 MSB bits of each byte by
      // accessing the lookup table.
      // Add T1 to T2 if 4 MSB bits of byte are all zeros.
      assert(VM_Version::supports_avx512bw(), "");
      evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
      vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
      vpand(xtmp2, dst, src, vec_enc);
      vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
      vpsrlw(xtmp3, src, 4, vec_enc);
      vpand(xtmp3, dst, xtmp3, vec_enc);
      vpshufb(dst, xtmp1, xtmp3, vec_enc);
      vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
      // ktmp selects the bytes whose high nibble is zero; only those get T1 added.
      evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
      evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6105 
// AVX/AVX2 per-byte leading-zero count via a nibble lookup table: the final
// blend keeps the high-nibble count alone when the high nibble is non-zero,
// otherwise it takes the sum of both table results.
// Leaves xtmp1 zeroed on exit (vector_count_leading_zeros_short_avx relies on this).
void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
  vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
  // T1 = Compute leading zero counts of 4 LSB bits of each byte by
  // accessing the lookup table.
  vpand(dst, xtmp2, src, vec_enc);
  vpshufb(dst, xtmp1, dst, vec_enc);
  // T2 = Compute leading zero counts of 4 MSB bits of each byte by
  // accessing the lookup table.
  vpsrlw(xtmp3, src, 4, vec_enc);
  vpand(xtmp3, xtmp2, xtmp3, vec_enc);
  vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
  // Add T1 to T2 if 4 MSB bits of byte are all zeros.
  vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
  vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);   // xtmp3 = (high nibble == 0) per byte
  vpaddb(dst, dst, xtmp2, vec_enc);
  vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
}
6125 
// AVX/AVX2 per-short leading-zero count, built on the per-byte counts: when
// the high byte of a word is zero, the low-byte count is folded into the
// high-byte count before the final shift extracts the per-word result.
void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add zero counts of lower byte and upper byte of a word if
  // upper byte holds a zero value.
  vpsrlw(xtmp3, src, 8, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
  vpsllw(xtmp2, dst, 8, vec_enc);
  vpaddw(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  vpsrlw(dst, dst, 8, vec_enc);
}
6139 
// AVX/AVX2 per-int leading-zero count: derived from the exponent of the
// int-to-float conversion (no vplzcnt available on this path).
void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                           XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // By converting the integer to a float, we can obtain the number of leading zeros based on the exponent of the float.
  // As the float exponent contains a bias of 127 for nonzero values, the bias must be removed before interpreting the
  // exponent as the leading zero count.

  // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
  // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
  // contributes to the leading number of zeros.
  vpsrld(dst, src, 1, vec_enc);
  vpandn(dst, dst, src, vec_enc);

  vcvtdq2ps(dst, dst, vec_enc);

  // By comparing the register to itself, all the bits in the destination are set.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);

  // Move the biased exponent to the low end of the lane and mask with 0xFF to discard the sign bit.
  vpsrld(xtmp2, xtmp1, 24, vec_enc);   // xtmp2 = 0xFF per lane
  vpsrld(dst, dst, 23, vec_enc);
  vpand(dst, xtmp2, dst, vec_enc);

  // Subtract 127 from the exponent, which removes the bias from the exponent.
  vpsrld(xtmp2, xtmp1, 25, vec_enc);   // xtmp2 = 127 per lane
  vpsubd(dst, dst, xtmp2, vec_enc);

  vpsrld(xtmp2, xtmp1, 27, vec_enc);   // xtmp2 = 31 per lane

  // If the original value is 0 the exponent would not have bias, so the subtraction creates a negative number. If this
  // is found in any of the lanes, replace the lane with -1 from xtmp1.
  vblendvps(dst, dst, xtmp1, dst, vec_enc, true, xtmp3);

  // If the original value is negative, replace the lane with 31.
  vblendvps(dst, dst, xtmp2, src, vec_enc, true, xtmp3);

  // Subtract the exponent from 31, giving the final result. For 0, the result is 32 as the exponent was replaced with -1,
  // and for negative numbers the result is 0 as the exponent was replaced with 31.
  vpsubd(dst, xtmp2, dst, vec_enc);
}
6179 
// Count leading zeros of each 64 bit (long) lane by computing the CLZ of the two
// 32 bit halves independently and combining them: if the top half is non-zero its
// count is the answer, otherwise the answer is 32 + the bottom half's count.
// Clobbers xtmp1-xtmp3; rtmp is unused here but kept for a uniform signature.
void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  // Find the leading zeros of the top and bottom halves of the long individually.
  vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);

  // Move the top half result to the bottom half of xtmp1, setting the top half to 0.
  vpsrlq(xtmp1, dst, 32, vec_enc);
  // By moving the top half result to the right by 6 bits, if the top half was empty (i.e. 32 is returned) the result bit will
  // be in the most significant position of the bottom half.
  vpsrlq(xtmp2, dst, 6, vec_enc);

  // In the bottom half, add the top half and bottom half results.
  vpaddq(dst, xtmp1, dst, vec_enc);

  // For the bottom half, choose between the values using the most significant bit of xtmp2.
  // If the MSB is set, then bottom+top in dst is the resulting value. If the top half is less than 32 xtmp1 is chosen,
  // which contains only the top half result.
  // In the top half the MSB is always zero, so the value in xtmp1 is always chosen. This value is always 0, which clears
  // the lane as required.
  vblendvps(dst, xtmp1, dst, xtmp2, vec_enc, true, xtmp3);
}
6201 
6202 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6203                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6204                                                        Register rtmp, int vec_enc) {
6205   assert(is_integral_type(bt), "unexpected type");
6206   assert(vec_enc < Assembler::AVX_512bit, "");
6207   switch(bt) {
6208     case T_LONG:
6209       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6210       break;
6211     case T_INT:
6212       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6213       break;
6214     case T_SHORT:
6215       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6216       break;
6217     case T_BYTE:
6218       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6219       break;
6220     default:
6221       fatal("Unsupported type %s", type2name(bt));
6222       break;
6223   }
6224 }
6225 
6226 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6227   switch(bt) {
6228     case T_BYTE:
6229       vpsubb(dst, src1, src2, vec_enc);
6230       break;
6231     case T_SHORT:
6232       vpsubw(dst, src1, src2, vec_enc);
6233       break;
6234     case T_INT:
6235       vpsubd(dst, src1, src2, vec_enc);
6236       break;
6237     case T_LONG:
6238       vpsubq(dst, src1, src2, vec_enc);
6239       break;
6240     default:
6241       fatal("Unsupported type %s", type2name(bt));
6242       break;
6243   }
6244 }
6245 
// Trailing zero count computation is based on leading zero count operation as per
// following equation. All AVX3 targets support AVX512CD feature which offers
// direct vector instruction to compute leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
// Clobbers xtmp1-xtmp4, ktmp and rtmp.
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp4 = -1 (all ones) in every lane
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp4 = src - 1
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp4 = (src - 1) & ~src  (isolates the trailing zero bits as a ones-run)
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  // xtmp4 = lane width in bits
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}
6264 
// Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
// Clobbers xtmp1-xtmp3 and rtmp.
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp3 = 0
  vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
  // xtmp3 = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp3 = -src | src (sets every bit from the lowest set bit of src upwards)
  vpor(xtmp3, xtmp3, src, vec_enc);
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  // xtmp1 = lane width in bits
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}
6280 
// Unsigned 32 bit division: quotient of rax /u divisor is left in rax.
// rdx is clobbered. Parameters are named after the fixed registers divl requires.
void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Divisor MSB clear: zero-extend the dividend into rdx:rax and use the hardware divide.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  // (with the divisor's MSB set, the unsigned quotient can only be 0 or 1).
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    // rax = ~(dividend - divisor) & dividend, in one instruction
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  // Quotient is the sign bit of the expression above.
  shrl(rax, 31);
  bind(done);
}
6304 
// Unsigned 32 bit modulus: remainder of rax /u divisor is left in rdx.
// rax is clobbered. Parameters are named after the fixed registers divl requires.
void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Divisor MSB clear: zero-extend the dividend into rdx:rax and use the hardware divide.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  // (with the divisor's MSB set, the unsigned quotient can only be 0 or 1).
  movl(rdx, rax); // save the dividend
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    // rax = ~(dividend - divisor) & dividend, in one instruction
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  // rax = 0 or -1, depending on whether the quotient is 0 or 1.
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}
6330 
// Combined unsigned 32 bit division and modulus: quotient in rax, remainder in rdx.
// tmp is clobbered. Parameters are named after the fixed registers divl requires.
void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Divisor MSB clear: zero-extend the dividend into rdx:rax and use the hardware
  // divide, which produces both quotient (rax) and remainder (rdx).
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movl(rdx, rax); // save the dividend
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    // rax = ~(dividend - divisor) & dividend, in one instruction
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  // tmp = 0 or -1, depending on whether the quotient is 0 or 1.
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}
6361 
// Reverses the bit order of a 32 bit integer (c.f. Integer.reverse): bits are
// first reversed within each byte, then the bytes themselves are swapped.
// Clobbers rtmp; xtmp1/xtmp2 are used only on GFNI-capable hardware.
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    // The affine matrix 0x8040201008040201 reverses the bits within each byte.
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  // Bits are now reversed within each byte; reverse the byte order to finish.
  bswapl(dst);
}
6400 
// Reverses the bit order of a 64 bit integer (c.f. Long.reverse): bits are
// first reversed within each byte, then the bytes themselves are swapped.
// Clobbers rtmp1/rtmp2; xtmp1/xtmp2 are used only on GFNI-capable hardware.
void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    // The affine matrix 0x8040201008040201 reverses the bits within each byte.
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2); // rtmp2 = 0xAAAA...AA
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2); // rtmp2 = 0xCCCC...CC
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2); // rtmp2 = 0xF0F0...F0
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  // Bits are now reversed within each byte; reverse the byte order to finish.
  bswapq(dst);
}
6445 
// Unsigned 64 bit division: quotient of rax /u divisor is left in rax.
// rdx is clobbered. Parameters are named after the fixed registers divq requires.
void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Divisor MSB clear: zero-extend the dividend into rdx:rax and use the hardware
  // divide. The 32 bit xor also zeroes the upper half of rdx.
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  // (with the divisor's MSB set, the unsigned quotient can only be 0 or 1).
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    // rax = ~(dividend - divisor) & dividend, in one instruction
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  // Quotient is the sign bit of the expression above.
  shrq(rax, 63);
  bind(done);
}
6469 
// Unsigned 64 bit modulus: remainder of rax /u divisor is left in rdx.
// rax is clobbered. Parameters are named after the fixed registers divq requires.
void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Divisor MSB clear: zero-extend the dividend into rdx:rax and use the hardware divide.
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  // (with the divisor's MSB set, the unsigned quotient can only be 0 or 1).
  movq(rdx, rax); // save the dividend
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    // rax = ~(dividend - divisor) & dividend, in one instruction
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  // rax = 0 or -1, depending on whether the quotient is 0 or 1.
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}
6495 
// Combined unsigned 64 bit division and modulus: quotient in rax, remainder in rdx.
// tmp is clobbered. Parameters are named after the fixed registers divq requires.
void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Divisor MSB clear: zero-extend the dividend into rdx:rax and use the hardware
  // divide, which produces both quotient (rax) and remainder (rdx).
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax); // save the dividend
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    // rax = ~(dividend - divisor) & dividend, in one instruction
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  // tmp = 0 or -1, depending on whether the quotient is 0 or 1.
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
6525 
// Cross-lane byte rearrangement: permutes the bytes of src according to the byte
// indices in 'shuffle', where indices may address any byte of the full vector
// (0-63 for 512 bit). Implemented as four in-lane shuffles of broadcast 128 bit
// source lanes, each masked to the 16-index range that lane covers.
// Clobbers xtmp1-xtmp3, rtmp and ktmp.
void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are inlane operations and indices are determined using
  // lower 4 bit of each shuffle lane, thus all shuffle indices are
  // normalized to index range 0-15. This makes sure that all the multiples
  // of an index value are placed at same relative position in 128 bit
  // lane i.e. elements corresponding to shuffle indices 16, 32 and 64
  // will be 16th element in their respective 128 bit lanes.
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
  // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
  // original shuffle indices and move the shuffled lanes corresponding to true
  // mask to destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
  // and broadcasting second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc); // xtmp2 = 32 in each byte lane
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
  // and broadcasting third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc); // xtmp1 = 48 in each byte lane
  evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc); // xtmp2 = 64 in each byte lane
  evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}
6571 
6572 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6573                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6574   if (vlen_enc == AVX_128bit) {
6575     vpermilps(dst, src, shuffle, vlen_enc);
6576   } else if (bt == T_INT) {
6577     vpermd(dst, shuffle, src, vlen_enc);
6578   } else {
6579     assert(bt == T_FLOAT, "");
6580     vpermps(dst, shuffle, src, vlen_enc);
6581   }
6582 }
6583 
6584 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6585   switch(opcode) {
6586     case Op_AddHF: vaddsh(dst, src1, src2); break;
6587     case Op_SubHF: vsubsh(dst, src1, src2); break;
6588     case Op_MulHF: vmulsh(dst, src1, src2); break;
6589     case Op_DivHF: vdivsh(dst, src1, src2); break;
6590     default: assert(false, "%s", NodeClassNames[opcode]); break;
6591   }
6592 }
6593 
6594 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6595   switch(elem_bt) {
6596     case T_BYTE:
6597       if (ideal_opc == Op_SaturatingAddV) {
6598         vpaddsb(dst, src1, src2, vlen_enc);
6599       } else {
6600         assert(ideal_opc == Op_SaturatingSubV, "");
6601         vpsubsb(dst, src1, src2, vlen_enc);
6602       }
6603       break;
6604     case T_SHORT:
6605       if (ideal_opc == Op_SaturatingAddV) {
6606         vpaddsw(dst, src1, src2, vlen_enc);
6607       } else {
6608         assert(ideal_opc == Op_SaturatingSubV, "");
6609         vpsubsw(dst, src1, src2, vlen_enc);
6610       }
6611       break;
6612     default:
6613       fatal("Unsupported type %s", type2name(elem_bt));
6614       break;
6615   }
6616 }
6617 
6618 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6619   switch(elem_bt) {
6620     case T_BYTE:
6621       if (ideal_opc == Op_SaturatingAddV) {
6622         vpaddusb(dst, src1, src2, vlen_enc);
6623       } else {
6624         assert(ideal_opc == Op_SaturatingSubV, "");
6625         vpsubusb(dst, src1, src2, vlen_enc);
6626       }
6627       break;
6628     case T_SHORT:
6629       if (ideal_opc == Op_SaturatingAddV) {
6630         vpaddusw(dst, src1, src2, vlen_enc);
6631       } else {
6632         assert(ideal_opc == Op_SaturatingSubV, "");
6633         vpsubusw(dst, src1, src2, vlen_enc);
6634       }
6635       break;
6636     default:
6637       fatal("Unsupported type %s", type2name(elem_bt));
6638       break;
6639   }
6640 }
6641 
// Saturating unsigned subtraction for 32/64 bit lanes on EVEX targets.
// Lanes that would underflow (src1 <u src2) saturate to zero.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                              XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when magnitude of second input is greater than first input.
  // ktmp = Inp2 <u Inp1, i.e. the lanes that do NOT underflow. Equal inputs fall in
  // the zeroed set below, but their difference is zero anyway.
  evpcmpu(elem_bt, ktmp,  src2, src1, Assembler::lt, vlen_enc);
  // Res = ktmp ? INP1 - INP2 : Zero (non-commutative and non-associative;
  // merge=false zeroes the unselected lanes)
  evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
}
6650 
// Saturating unsigned subtraction for 32/64 bit lanes on AVX targets (no mask
// registers). Lanes that would underflow (src1 <u src2) saturate to zero.
// Clobbers xtmp1/xtmp2.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Emulate unsigned comparison using signed comparison
  // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
  vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
  vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);

  // xtmp2 = underflow mask (all ones where src2 >u src1)
  vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);

  // Res = INP1 - INP2 (non-commutative and non-associative)
  vpsub(elem_bt, dst, src1, src2, vlen_enc);
  // Res = Mask ? Zero : Res
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
}
6667 
// Saturating unsigned addition for 32/64 bit lanes on EVEX targets.
// Overflow saturates the lane to the maximum unsigned value (all ones).
// Clobbers xtmp1/xtmp2 and ktmp.
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                               XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned values ranges comprise of only +ve numbers, thus there exist only an upper bound saturation.
  // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // T1 = SRC1 | SRC2
  vpor(xtmp1, src1, src2, vlen_enc);
  // Max_Unsigned = -1
  vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
  // Unsigned compare:  Mask = Res <u T1
  evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
  // res  = Mask ? Max_Unsigned : Res
  evpblend(elem_bt, dst, ktmp,  dst, xtmp2, true, vlen_enc);
}
6683 
6684 //
6685 // Section 2-13 Hacker's Delight list following overflow detection check for saturating
6686 // unsigned addition operation.
6687 //    overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
6688 //
6689 // We empirically determined its semantic equivalence to following reduced expression
6690 //    overflow_mask =  (a + b) <u (a | b)
6691 //
6692 // and also verified it though Alive2 solver.
6693 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6694 //
6695 
// Saturating unsigned addition for 32/64 bit lanes on AVX targets (no mask
// registers). Overflow saturates the lane to the maximum unsigned value (all
// ones). Clobbers xtmp1-xtmp3.
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // Compute T1 = INP1 | INP2
  vpor(xtmp3, src1, src2, vlen_enc);
  // xtmp2 = minimum signed value (sign bit only); as a side effect xtmp1 holds -1
  // in all lanes (materialized by vpgenmin_value with compute_allones = true).
  vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  // Convert T1 to signed value, T1 = T1 + MIN_VALUE
  vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
  // Convert Res to signed value, Res<s> = Res + MIN_VALUE
  vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<s> <s T1
  if (elem_bt == T_INT) {
    vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
  } else {
    assert(elem_bt == T_LONG, "");
    vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
  }
  // Res = overflow ? Max_Unsigned (-1, still held in xtmp1) : Res
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
6717 
// Emulates evpmovq2m (extracts the sign bit of each 64 bit lane into the mask
// register ktmp) for EVEX targets lacking AVX512DQ. xtmp2_hold_M1 tells this
// routine that xtmp2 already holds -1 in all lanes so it can skip
// re-materializing the constant. Clobbers xtmp1 (and xtmp2 unless preset).
void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovq2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      // xtmp2 = -1 in all lanes
      vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    // xtmp1 = sign of each lane broadcast across the lane (0 or -1)
    evpsraq(xtmp1, src, 63, vlen_enc);
    // Mask bit set where the lane is negative (sign broadcast equals -1).
    evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}
6731 
// Emulates evpmovd2m (extracts the sign bit of each 32 bit lane into the mask
// register ktmp) for EVEX targets lacking AVX512DQ. xtmp2_hold_M1 tells this
// routine that xtmp2 already holds -1 in all lanes so it can skip
// re-materializing the constant. Clobbers xtmp1 (and xtmp2 unless preset).
void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovd2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      // xtmp2 = -1 in all lanes
      vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    // xtmp1 = sign of each lane broadcast across the lane (0 or -1)
    vpsrad(xtmp1, src, 31, vlen_enc);
    // Mask bit set where the lane is negative (sign broadcast equals -1).
    Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}
6745 
6746 
6747 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6748   if (elem_bt == T_LONG) {
6749     if (VM_Version::supports_evex()) {
6750       evpsraq(dst, src, 63, vlen_enc);
6751     } else {
6752       vpsrad(dst, src, 31, vlen_enc);
6753       vpshufd(dst, dst, 0xF5, vlen_enc);
6754     }
6755   } else {
6756     assert(elem_bt == T_INT, "");
6757     vpsrad(dst, src, 31, vlen_enc);
6758   }
6759 }
6760 
6761 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6762   if (compute_allones) {
6763     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6764       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6765     } else {
6766       vpcmpeqq(allones, allones, allones, vlen_enc);
6767     }
6768   }
6769   if (elem_bt == T_LONG) {
6770     vpsrlq(dst, allones, 1, vlen_enc);
6771   } else {
6772     assert(elem_bt == T_INT, "");
6773     vpsrld(dst, allones, 1, vlen_enc);
6774   }
6775 }
6776 
6777 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6778   if (compute_allones) {
6779     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6780       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6781     } else {
6782       vpcmpeqq(allones, allones, allones, vlen_enc);
6783     }
6784   }
6785   if (elem_bt == T_LONG) {
6786     vpsllq(dst, allones, 63, vlen_enc);
6787   } else {
6788     assert(elem_bt == T_INT, "");
6789     vpslld(dst, allones, 31, vlen_enc);
6790   }
6791 }
6792 
6793 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
6794                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6795   switch(elem_bt) {
6796     case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6797     case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6798     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6799     case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6800     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6801   }
6802 }
6803 
6804 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6805   switch(elem_bt) {
6806     case  T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
6807     case  T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
6808     case  T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6809     case  T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
6810     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6811   }
6812 }
6813 
6814 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6815                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6816   if (elem_bt == T_LONG) {
6817     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6818   } else {
6819     assert(elem_bt == T_INT, "");
6820     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6821   }
6822 }
6823 
// Saturating signed addition/subtraction for 32/64 bit lanes on EVEX targets.
// On overflow a lane is replaced with MAX_VALUE or MIN_VALUE: lanes whose first
// input is negative saturate to MIN, the rest to MAX.
// Clobbers xtmp1/xtmp2 and ktmp1/ktmp2.
void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                         KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's compliment representation of numbers and is agnostic to signed'ness.
  // Overflow detection based on Hacker's delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Compute overflow detection mask.
  evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 holds -1 in all its lanes after above call.

  // Compute mask based on first input polarity.
  evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);

  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in first input polarity mask holds a min value.
  evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
  // Blend destination lanes with saturated values using overflow detection mask.
  evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
}
6866 
6867 
// Saturating signed addition/subtraction for 32/64 bit lanes on AVX targets
// (no mask registers; blends use byte-wise vpblendvb with a sign-extended mask).
// On overflow a lane is replaced with MAX_VALUE or MIN_VALUE: lanes whose first
// input is negative saturate to MIN, the rest to MAX. Clobbers xtmp1-xtmp4.
void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                        XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                        XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's compliment representation of numbers and is agnostic to signed'ness.
  // Overflow detection based on Hacker's delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Sign-extend to compute overflow detection mask.
  vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);

  // xtmp1 = -1 (all ones) in every lane.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose saturating min/max vector using first input polarity mask.
  vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
  vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);

  // Blend result with saturating vector using overflow detection mask.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
6908 
6909 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6910   switch(elem_bt) {
6911     case T_BYTE:
6912       if (ideal_opc == Op_SaturatingAddV) {
6913         vpaddsb(dst, src1, src2, vlen_enc);
6914       } else {
6915         assert(ideal_opc == Op_SaturatingSubV, "");
6916         vpsubsb(dst, src1, src2, vlen_enc);
6917       }
6918       break;
6919     case T_SHORT:
6920       if (ideal_opc == Op_SaturatingAddV) {
6921         vpaddsw(dst, src1, src2, vlen_enc);
6922       } else {
6923         assert(ideal_opc == Op_SaturatingSubV, "");
6924         vpsubsw(dst, src1, src2, vlen_enc);
6925       }
6926       break;
6927     default:
6928       fatal("Unsupported type %s", type2name(elem_bt));
6929       break;
6930   }
6931 }
6932 
6933 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6934   switch(elem_bt) {
6935     case T_BYTE:
6936       if (ideal_opc == Op_SaturatingAddV) {
6937         vpaddusb(dst, src1, src2, vlen_enc);
6938       } else {
6939         assert(ideal_opc == Op_SaturatingSubV, "");
6940         vpsubusb(dst, src1, src2, vlen_enc);
6941       }
6942       break;
6943     case T_SHORT:
6944       if (ideal_opc == Op_SaturatingAddV) {
6945         vpaddusw(dst, src1, src2, vlen_enc);
6946       } else {
6947         assert(ideal_opc == Op_SaturatingSubV, "");
6948         vpsubusw(dst, src1, src2, vlen_enc);
6949       }
6950       break;
6951     default:
6952       fatal("Unsupported type %s", type2name(elem_bt));
6953       break;
6954   }
6955 }
6956 
6957 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6958                                                      XMMRegister src2, int vlen_enc) {
6959   switch(elem_bt) {
6960     case T_BYTE:
6961       evpermi2b(dst, src1, src2, vlen_enc);
6962       break;
6963     case T_SHORT:
6964       evpermi2w(dst, src1, src2, vlen_enc);
6965       break;
6966     case T_INT:
6967       evpermi2d(dst, src1, src2, vlen_enc);
6968       break;
6969     case T_LONG:
6970       evpermi2q(dst, src1, src2, vlen_enc);
6971       break;
6972     case T_FLOAT:
6973       evpermi2ps(dst, src1, src2, vlen_enc);
6974       break;
6975     case T_DOUBLE:
6976       evpermi2pd(dst, src1, src2, vlen_enc);
6977       break;
6978     default:
6979       fatal("Unsupported type %s", type2name(elem_bt));
6980       break;
6981   }
6982 }
6983 
6984 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
6985   if (is_unsigned) {
6986     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6987   } else {
6988     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6989   }
6990 }
6991 
6992 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
6993   if (is_unsigned) {
6994     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6995   } else {
6996     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6997   }
6998 }
6999 
7000 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
7001   switch(opcode) {
7002     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7003     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7004     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7005     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7006     default: assert(false, "%s", NodeClassNames[opcode]); break;
7007   }
7008 }
7009 
7010 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7011   switch(opcode) {
7012     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7013     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7014     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7015     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7016     default: assert(false, "%s", NodeClassNames[opcode]); break;
7017   }
7018 }
7019 
void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
  // Scalar FP16 max/min delegates to the vector lowering at the narrowest
  // (128-bit) vector length. Clobbers ktmp, xtmp1 and xtmp2.
  vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
}
7024 
void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Element-wise FP16 max/min with explicit NaN and signed-zero handling:
  // VMAXPH/VMINPH alone are non-commutative (they favor the second operand for
  // equal-zero and single-NaN inputs), so the operands are pre-swapped per lane
  // and a NaN fix-up blend is applied afterwards. Clobbers ktmp, xtmp1, xtmp2.
  if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
    // Move sign bits of src2 to mask register.
    evpmovw2m(ktmp, src2, vlen_enc);
    // xtmp1 = src2 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make the second source operand a non-negative value.
    // As per instruction semantic, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned
    // dst = max(xtmp1, xtmp2)
    evmaxph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if it's a NaN value,
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  } else {
    assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
    // Move sign bits of src1 to mask register.
    evpmovw2m(ktmp, src1, vlen_enc);
    // xtmp1 = src1 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src1 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make the second source operand a negative value.
    // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned.
    // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
    // or a valid floating-point value, is written to the result.
    // dst = min(xtmp1, xtmp2)
    evminph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if it's a NaN value,
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  }
}