1 /*
   2  * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "gc/shared/barrierSet.hpp"
  28 #include "gc/shared/barrierSetAssembler.hpp"
  29 #include "oops/methodData.hpp"
  30 #include "opto/c2_MacroAssembler.hpp"
  31 #include "opto/intrinsicnode.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/opcodes.hpp"
  34 #include "opto/subnode.hpp"
  35 #include "runtime/globals.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/objectMonitorTable.hpp"
  38 #include "runtime/stubRoutines.hpp"
  39 #include "runtime/synchronizer.hpp"
  40 #include "utilities/checkedCast.hpp"
  41 #include "utilities/globalDefinitions.hpp"
  42 #include "utilities/powerOfTwo.hpp"
  43 #include "utilities/sizes.hpp"
  44 
  45 #ifdef PRODUCT
  46 #define BLOCK_COMMENT(str) /* nothing */
  47 #define STOP(error) stop(error)
  48 #else
  49 #define BLOCK_COMMENT(str) block_comment(str)
  50 #define STOP(error) block_comment(error); stop(error)
  51 #endif
  52 
  53 // C2 compiled method's prolog code.
// Emits the C2 method prolog: an optional stack-overflow bang, frame
// allocation with rbp save, optional stack-depth cookie and alignment
// checks, and the nmethod entry barrier.
//   framesize       - frame size in bytes, including the return address word
//   stack_bang_size - bytes to bang for stack-overflow checking; <= 0 means
//                     the caller is responsible for banging
//   fp_mode_24b     - not referenced in this body (kept for signature parity)
//   is_stub         - stubs skip the nmethod entry barrier
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // No stack bang requested: allocate the whole frame first, then store
    // rbp into its slot instead of pushing it.
    subptr(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    // Store a recognizable cookie below the saved rbp so stack depth can be
    // validated at call sites.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    // Check that rsp is StackAlignmentInBytes-aligned modulo the pushed
    // return address (i.e. rsp % alignment == alignment - wordSize).
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
    // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
    Label dummy_slow_path;
    Label dummy_continuation;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
    }
    bs->nmethod_entry_barrier(this, slow_path, continuation);
  }
}
 135 
 136 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 137   switch (vlen_in_bytes) {
 138     case  4: // fall-through
 139     case  8: // fall-through
 140     case 16: return Assembler::AVX_128bit;
 141     case 32: return Assembler::AVX_256bit;
 142     case 64: return Assembler::AVX_512bit;
 143 
 144     default: {
 145       ShouldNotReachHere();
 146       return Assembler::AVX_NoVec;
 147     }
 148   }
 149 }
 150 
 151 // fast_lock and fast_unlock used by C2
 152 
 153 // Because the transitions from emitted code to the runtime
 154 // monitorenter/exit helper stubs are so slow it's critical that
 155 // we inline both the stack-locking fast path and the inflated fast path.
 156 //
 157 // See also: cmpFastLock and cmpFastUnlock.
 158 //
 159 // What follows is a specialized inline transliteration of the code
 160 // in enter() and exit(). If we're concerned about I$ bloat another
 161 // option would be to emit TrySlowEnter and TrySlowExit methods
 162 // at startup-time.  These methods would accept arguments as
 163 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 164 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 165 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 166 // In practice, however, the # of lock sites is bounded and is usually small.
 167 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
 171 //
 172 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
 173 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 174 // to those specialized methods.  That'd give us a mostly platform-independent
 175 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
 177 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 178 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 179 // (b) explicit barriers or fence operations.
 180 //
 181 // TODO:
 182 //
 183 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 184 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 185 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 186 //    the lock operators would typically be faster than reifying Self.
 187 //
 188 // *  Ideally I'd define the primitives as:
 189 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 190 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 191 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 192 //    Instead, we're stuck with a rather awkward and brittle register assignments below.
 193 //    Furthermore the register assignments are overconstrained, possibly resulting in
 194 //    sub-optimal code near the synchronization site.
 195 //
 196 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 197 //    Alternately, use a better sp-proximity test.
 198 //
 199 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 200 //    Either one is sufficient to uniquely identify a thread.
 201 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 202 //
 203 // *  Intrinsify notify() and notifyAll() for the common cases where the
 204 //    object is locked by the calling thread but the waitlist is empty.
 205 //    avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
 206 //
 207 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 208 //    But beware of excessive branch density on AMD Opterons.
 209 //
 210 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 211 //    or failure of the fast path.  If the fast path fails then we pass
 212 //    control to the slow path, typically in C.  In fast_lock and
 213 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 214 //    will emit a conditional branch immediately after the node.
 215 //    So we have branches to branches and lots of ICC.ZF games.
 216 //    Instead, it might be better to have C2 pass a "FailureLabel"
 217 //    into fast_lock and fast_unlock.  In the case of success, control
 218 //    will drop through the node.  ICC.ZF is undefined at exit.
 219 //    In the case of failure, the node will branch directly to the
 220 //    FailureLabel
 221 
 222 // obj: object to lock
 223 // box: on-stack box address -- KILLED
 224 // rax: tmp -- KILLED
 225 // t  : tmp -- KILLED
// Emits the inlined fast path for monitorenter. On exit, ZF == 1 means the
// lock was acquired; ZF == 0 means the caller must branch to the slow path
// (C2 conditions the continuation on ZF; see comment at end of function).
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register rax_reg,
                                  Register t, Register thread) {
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Synchronizing on a value-based class diverts to the slow path, which
    // performs the diagnostic reporting.
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Fast Lock

    Label push;

    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    // rax holds the expected (unlocked) mark, mark holds the desired (locked)
    // value; cmpxchg sets ZF on success.
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmp(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      // Without the table, the (tagged) mark word itself is the monitor pointer.
      assert(mark == monitor, "should be the same here");
    } else {
      const Register hash = t;
      Label monitor_found;

      // Look for the monitor in the om_cache.

      ByteSize cache_offset   = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled  = OMCache::CAPACITY;
      // Fully unrolled scan of the per-thread oop->monitor cache.
      for (int i = 0; i < num_unrolled; i++) {
        movptr(monitor, Address(thread,  cache_offset + monitor_offset));
        cmpptr(obj, Address(thread, cache_offset));
        jccb(Assembler::equal, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      // Look for the monitor in the table.

      // Get the hash code.
      movptr(hash, Address(obj, oopDesc::mark_offset_in_bytes()));
      shrq(hash, markWord::hash_shift);
      andq(hash, markWord::hash_mask);

      // Get the table and calculate the bucket's address.
      lea(rax_reg, ExternalAddress(ObjectMonitorTable::current_table_address()));
      movptr(rax_reg, Address(rax_reg));
      andq(hash, Address(rax_reg, ObjectMonitorTable::table_capacity_mask_offset()));
      movptr(rax_reg, Address(rax_reg, ObjectMonitorTable::table_buckets_offset()));

      // Read the monitor from the bucket.
      movptr(monitor, Address(rax_reg, hash, Address::times_ptr));

      // Check if the monitor in the bucket is special (empty, tombstone or removed)
      cmpptr(monitor, ObjectMonitorTable::SpecialPointerValues::below_is_special);
      jcc(Assembler::below, slow_path);

      // Check if object matches.
      // The monitor holds its object as a weak handle; resolve it and bail to
      // the slow path if it is cleared or does not match obj (hash collision).
      movptr(rax_reg, Address(monitor, ObjectMonitor::object_offset()));
      BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
      bs_asm->try_resolve_weak_handle_in_c2(this, rax_reg, slow_path);
      cmpptr(rax_reg, obj);
      jcc(Assembler::notEqual, slow_path);

      bind(monitor_found);
    }
    // Without the table, 'monitor' still carries the 0b10 tag; subtract it
    // from the field offsets instead of untagging the pointer.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock before trashing box. On failure to acquire
      // the lock, the slow path will reset the entry accordingly (see CacheSetter).
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
    lock(); cmpxchgptr(box, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive.
    // On CAS failure rax holds the current owner; equal means we already own it.
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
 397 
 398 // obj: object to lock
 399 // rax: tmp -- KILLED
 400 // t  : tmp - cannot be obj nor rax -- KILLED
 401 //
 402 // Some commentary on balanced locking:
 403 //
 404 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 405 // Methods that don't have provably balanced locking are forced to run in the
 406 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 407 // The interpreter provides two properties:
 408 // I1:  At return-time the interpreter automatically and quietly unlocks any
 409 //      objects acquired in the current activation (frame).  Recall that the
 410 //      interpreter maintains an on-stack list of locks currently held by
 411 //      a frame.
 412 // I2:  If a method attempts to unlock an object that is not held by the
 413 //      frame the interpreter throws IMSX.
 414 //
 415 // Lets say A(), which has provably balanced locking, acquires O and then calls B().
 416 // B() doesn't have provably balanced locking so it runs in the interpreter.
 417 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 418 // is still locked by A().
 419 //
 420 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface
 421 // Specification" states that an object locked by JNI's MonitorEnter should not be
 422 // unlocked by "normal" java-level locking and vice-versa.  The specification doesn't
 423 // specify what will occur if a program engages in such mixed-mode locking, however.
 424 // Arguably given that the spec legislates the JNI case as undefined our implementation
 425 // could reasonably *avoid* checking owner in fast_unlock().
 426 // In the interest of performance we elide m->Owner==Self check in unlock.
 427 // A perfectly viable alternative is to elide the owner check except when
 428 // Xcheck:jni is enabled.
 429 
// Emits the inlined fast path for monitorexit. On exit, ZF == 1 means the
// unlock completed; ZF == 0 means the caller must branch to the slow path
// (C2 conditions the continuation on ZF; see comment at end of function).
void C2_MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register t, Register thread) {
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully.  MUST jump with ZF == 1
  Label unlocked, slow_path;

  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockStub* stub = nullptr;

  // Out-of-line stub for the push-back-and-go-slow path; skipped while only
  // measuring code size in scratch emission.
  if (!Compile::current()->output()->in_scratch_emit_size()) {
    stub = new (Compile::current()->comp_arena()) C2FastUnlockStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Fast Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    // If the next-lower lock-stack slot also holds obj, this was a recursive
    // stack-lock; popping one slot completes the unlock.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    // rax holds the expected (locked) mark, mark holds the desired (unlocked)
    // value; on failure the stub re-pushes obj and takes the slow path.
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    // Debug-only scan: obj must not appear anywhere on the lock-stack if it
    // is monitor-locked.
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jcc(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      // Without the table, the (tagged) mark word itself is the monitor pointer.
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    // Without the table, 'monitor' still carries the 0b10 tag; subtract it
    // from the field offsets instead of untagging the pointer.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jcc(Assembler::notZero, recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    cmpptr(entry_list_address, NULL_WORD);
    jcc(Assembler::zero, unlocked);    // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jcc(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
  }

  bind(unlocked);
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Unlock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
 591 
// Out-of-line abort for verify_int_in_range(): called (never returns) when a
// CastII node's runtime value falls outside its type's [lo, hi] bounds.
static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}
 595 
 596 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
 597   const int framesize = Compile::current()->output()->frame_size_in_bytes();
 598   masm->movptr(dst, rsp);
 599   if (framesize > 2 * wordSize) {
 600     masm->addptr(dst, framesize - 2 * wordSize);
 601   }
 602 }
 603 
// Make rbp hold a valid frame pointer before calling into the VM. With
// PreserveFramePointer, rbp is already valid (debug builds verify it against
// a recomputed value in rtmp); otherwise it is reconstructed from rsp.
void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
  if (PreserveFramePointer) {
    // frame pointer is valid
#ifdef ASSERT
    // Verify frame pointer value in rbp.
    reconstruct_frame_pointer_helper(this, rtmp);
    Label L_success;
    cmpq(rbp, rtmp);
    jccb(Assembler::equal, L_success);
    STOP("frame pointer mismatch");
    bind(L_success);
#endif // ASSERT
  } else {
    reconstruct_frame_pointer_helper(this, rbp);
  }
}
 620 
// Emit a runtime check that 'val' lies within the CastII type's [lo, hi]
// range; on violation, marshal (idx, val, lo, hi) into the C argument
// registers and call abort_verify_int_in_range, which is fatal.
//   idx - node index, passed through for diagnostics
//   t   - the int type carrying the bounds
//   val - register holding the runtime value
void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
  jint lo = t->_lo;
  jint hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
  // A full-range type can never fail: emit nothing.
  if (t == TypeInt::INT) {
    return;
  }

  BLOCK_COMMENT("CastII {");
  Label fail;
  Label succeed;

  // Only emit the comparisons that can actually fail.
  if (lo != min_jint) {
    cmpl(val, lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jint) {
    cmpl(val, hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  movl(c_rarg0, idx);
  movl(c_rarg1, val);
  movl(c_rarg2, lo);
  movl(c_rarg3, hi);
  // Ensure rbp is a valid frame pointer before the call (presumably for
  // stack walking during error reporting -- see reconstruct_frame_pointer).
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
  // The callee is fatal and does not return; hlt() marks the path unreachable.
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastII");
}
 654 
// Out-of-line abort for verify_long_in_range(): called (never returns) when a
// CastLL node's runtime value falls outside its type's [lo, hi] bounds.
static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}
 658 
// Emit a runtime check that 'val' lies within the CastLL type's [lo, hi]
// range; on violation, marshal (idx, val, lo, hi) into the C argument
// registers and call abort_verify_long_in_range, which is fatal.
//   idx - node index, passed through for diagnostics
//   t   - the long type carrying the bounds
//   val - register holding the runtime value
//   tmp - scratch register for materializing 64-bit bounds
void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
  jlong lo = t->_lo;
  jlong hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
  // A full-range type can never fail: emit nothing.
  if (t == TypeLong::LONG) {
    return;
  }

  BLOCK_COMMENT("CastLL {");
  Label fail;
  Label succeed;

  // Compare val against a 64-bit bound: use an immediate when the bound fits
  // in a signed 32-bit field, otherwise materialize it in tmp first.
  auto cmp_val = [&](jlong bound) {
    if (is_simm32(bound)) {
      cmpq(val, checked_cast<int>(bound));
    } else {
      mov64(tmp, bound);
      cmpq(val, tmp);
    }
  };

  // Only emit the comparisons that can actually fail.
  if (lo != min_jlong) {
    cmp_val(lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jlong) {
    cmp_val(hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  movl(c_rarg0, idx);
  movq(c_rarg1, val);
  mov64(c_rarg2, lo);
  mov64(c_rarg3, hi);
  // Ensure rbp is a valid frame pointer before the call (presumably for
  // stack walking during error reporting -- see reconstruct_frame_pointer).
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
  // The callee is fatal and does not return; hlt() marks the path unreachable.
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastLL");
}
 701 
 702 //-------------------------------------------------------------------------------------------
 703 // Generic instructions support for use in .ad files C2 code generation
 704 
 705 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 706   if (dst != src) {
 707     movdqu(dst, src);
 708   }
 709   if (opcode == Op_AbsVD) {
 710     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 711   } else {
 712     assert((opcode == Op_NegVD),"opcode should be Op_NegD");
 713     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 714   }
 715 }
 716 
 717 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 718   if (opcode == Op_AbsVD) {
 719     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 720   } else {
 721     assert((opcode == Op_NegVD),"opcode should be Op_NegD");
 722     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 723   }
 724 }
 725 
 726 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 727   if (dst != src) {
 728     movdqu(dst, src);
 729   }
 730   if (opcode == Op_AbsVF) {
 731     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 732   } else {
 733     assert((opcode == Op_NegVF),"opcode should be Op_NegF");
 734     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 735   }
 736 }
 737 
 738 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 739   if (opcode == Op_AbsVF) {
 740     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 741   } else {
 742     assert((opcode == Op_NegVF),"opcode should be Op_NegF");
 743     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 744   }
 745 }
 746 
// Emit a signed element-wise min/max using legacy SSE: dst = min/max(dst, src).
// T_BYTE/T_SHORT/T_INT map to single pmin*/pmax* instructions. T_LONG has no
// SSE pmin/pmax, so it is synthesized with pcmpgtq + blendvpd; the non-VEX
// blendvpd implicitly uses xmm0 as the blend mask, hence tmp must be xmm0.
void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      // mask = (dst > src); dst = mask ? src : dst  -- i.e. keep the smaller.
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src);  // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      // mask = (src > dst); dst = mask ? src : dst  -- i.e. keep the larger.
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src);  // xmm0 as mask
    }
  }
}
 783 
 784 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 785                                   XMMRegister src1, Address src2, int vlen_enc) {
 786   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 787   if (opcode == Op_UMinV) {
 788     switch(elem_bt) {
 789       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 790       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 791       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 792       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 793       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 794     }
 795   } else {
 796     assert(opcode == Op_UMaxV, "required");
 797     switch(elem_bt) {
 798       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 799       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 800       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 801       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 802       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 803     }
 804   }
 805 }
 806 
// Packed unsigned 64-bit (quadword) min/max.
// On AVX512 targets without AVX512VL, emit the EVEX evpminuq/evpmaxuq at the
// full 512-bit width (the only width those encodings support there); the lanes
// beyond the logical vector size are don't-care for the caller. Otherwise the
// unsigned compare is emulated in the signed domain: adding 2^63 (built as
// -1 << 63) to both operands maps unsigned order onto signed order, so a
// signed vpcmpgtq yields the correct selection mask for the blend.
void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // For optimality, leverage a full vector width of 512 bits
  // for operations over smaller vector sizes on AVX512 targets.
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
    // T1 = -1
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63 (i.e. 0x8000...0000, the signed-domain bias 2^63)
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
    // Mask = T2 > T1 (signed compare now reflects unsigned src2 > src1)
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}
 837 
 838 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 839                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
 840   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 841   if (opcode == Op_UMinV) {
 842     switch(elem_bt) {
 843       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 844       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 845       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 846       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 847       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 848     }
 849   } else {
 850     assert(opcode == Op_UMaxV, "required");
 851     switch(elem_bt) {
 852       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 853       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 854       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 855       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 856       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 857     }
 858   }
 859 }
 860 
 861 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 862                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 863                                  int vlen_enc) {
 864   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 865 
 866   if (opcode == Op_MinV) {
 867     if (elem_bt == T_BYTE) {
 868       vpminsb(dst, src1, src2, vlen_enc);
 869     } else if (elem_bt == T_SHORT) {
 870       vpminsw(dst, src1, src2, vlen_enc);
 871     } else if (elem_bt == T_INT) {
 872       vpminsd(dst, src1, src2, vlen_enc);
 873     } else {
 874       assert(elem_bt == T_LONG, "required");
 875       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 876         vpminsq(dst, src1, src2, vlen_enc);
 877       } else {
 878         assert_different_registers(dst, src1, src2);
 879         vpcmpgtq(dst, src1, src2, vlen_enc);
 880         vblendvpd(dst, src1, src2, dst, vlen_enc);
 881       }
 882     }
 883   } else { // opcode == Op_MaxV
 884     if (elem_bt == T_BYTE) {
 885       vpmaxsb(dst, src1, src2, vlen_enc);
 886     } else if (elem_bt == T_SHORT) {
 887       vpmaxsw(dst, src1, src2, vlen_enc);
 888     } else if (elem_bt == T_INT) {
 889       vpmaxsd(dst, src1, src2, vlen_enc);
 890     } else {
 891       assert(elem_bt == T_LONG, "required");
 892       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 893         vpmaxsq(dst, src1, src2, vlen_enc);
 894       } else {
 895         assert_different_registers(dst, src1, src2);
 896         vpcmpgtq(dst, src1, src2, vlen_enc);
 897         vblendvpd(dst, src2, src1, dst, vlen_enc);
 898       }
 899     }
 900   }
 901 }
 902 
 903 // Float/Double min max
 904 
// Float/double vector min/max with Java semantics (-0.0 < +0.0 and NaN
// propagation), built around the hardware vmin/vmax instructions plus the
// pre/post-processing explained in the comment below.
void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   *  Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  // Select single- vs double-precision instruction variants, and pick which
  // operand's sign bit drives the swap blend: a for min, b for max (matching
  // the pseudo code above).
  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  // E-core tuned path: materialize the blend mask once up front by smearing
  // each lane's sign bit across the whole lane, so the vblendv calls below can
  // take a precomputed mask instead of deriving it from the sign bit each time.
  bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
  if (precompute_mask && !is_double_word) {
    // Arithmetic right shift with a count >= the 32-bit lane width fills the
    // lane with copies of its sign bit.
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    // No immediate 64-bit arithmetic shift here; derive the sign mask as (0 > mask).
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  // NaN fix-up: lanes where atmp is unordered with itself (i.e. NaN) take atmp.
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}
 992 
// AVX512 variant of vminmax_fp: same algorithm (swap operands on the sign of
// a/b, run hardware vmin/vmax, then substitute atmp back for NaN lanes), but
// the swap and NaN masks live in a k-register. evpmovd2m/evpmovq2m extract
// each lane's most-significant (sign) bit into ktmp, evblendm* performs the
// masked swap, and the final masked evmovdq* merges atmp into dst for lanes
// whose unordered self-compare flagged a NaN.
void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    // Min keys the swap off a's sign; max (below) keys off b's sign.
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}
1039 
1040 void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1041                                    XMMRegister src1, XMMRegister src2, int vlen_enc) {
1042   assert(opc == Op_MinV || opc == Op_MinReductionV ||
1043          opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");
1044 
1045   int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
1046                                                          : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
1047   if (elem_bt == T_FLOAT) {
1048     evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
1049   } else {
1050     assert(elem_bt == T_DOUBLE, "");
1051     evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
1052   }
1053 }
1054 
1055 // Float/Double signum
1056 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1057   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1058 
1059   Label DONE_LABEL;
1060 
1061   // Handle special cases +0.0/-0.0 and NaN, if argument is +0.0/-0.0 or NaN, return argument
1062   // If AVX10.2 (or newer) floating point comparison instructions used, SF=1 for equal and unordered cases
1063   // If other floating point comparison instructions used, ZF=1 for equal and unordered cases
1064   if (opcode == Op_SignumF) {
1065     if (VM_Version::supports_avx10_2()) {
1066       vucomxss(dst, zero);
1067       jcc(Assembler::negative, DONE_LABEL);
1068     } else {
1069       ucomiss(dst, zero);
1070       jcc(Assembler::equal, DONE_LABEL);
1071     }
1072     movflt(dst, one);
1073     jcc(Assembler::above, DONE_LABEL);
1074     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1075   } else if (opcode == Op_SignumD) {
1076     if (VM_Version::supports_avx10_2()) {
1077       vucomxsd(dst, zero);
1078       jcc(Assembler::negative, DONE_LABEL);
1079     } else {
1080       ucomisd(dst, zero);
1081       jcc(Assembler::equal, DONE_LABEL);
1082     }
1083     movdbl(dst, one);
1084     jcc(Assembler::above, DONE_LABEL);
1085     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1086   }
1087 
1088   bind(DONE_LABEL);
1089 }
1090 
1091 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1092   if (sign) {
1093     pmovsxbw(dst, src);
1094   } else {
1095     pmovzxbw(dst, src);
1096   }
1097 }
1098 
1099 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1100   if (sign) {
1101     vpmovsxbw(dst, src, vector_len);
1102   } else {
1103     vpmovzxbw(dst, src, vector_len);
1104   }
1105 }
1106 
1107 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1108   if (sign) {
1109     vpmovsxbd(dst, src, vector_len);
1110   } else {
1111     vpmovzxbd(dst, src, vector_len);
1112   }
1113 }
1114 
1115 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1116   if (sign) {
1117     vpmovsxwd(dst, src, vector_len);
1118   } else {
1119     vpmovzxwd(dst, src, vector_len);
1120   }
1121 }
1122 
1123 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1124                                      int shift, int vector_len) {
1125   if (opcode == Op_RotateLeftV) {
1126     if (etype == T_INT) {
1127       evprold(dst, src, shift, vector_len);
1128     } else {
1129       assert(etype == T_LONG, "expected type T_LONG");
1130       evprolq(dst, src, shift, vector_len);
1131     }
1132   } else {
1133     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1134     if (etype == T_INT) {
1135       evprord(dst, src, shift, vector_len);
1136     } else {
1137       assert(etype == T_LONG, "expected type T_LONG");
1138       evprorq(dst, src, shift, vector_len);
1139     }
1140   }
1141 }
1142 
1143 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1144                                      XMMRegister shift, int vector_len) {
1145   if (opcode == Op_RotateLeftV) {
1146     if (etype == T_INT) {
1147       evprolvd(dst, src, shift, vector_len);
1148     } else {
1149       assert(etype == T_LONG, "expected type T_LONG");
1150       evprolvq(dst, src, shift, vector_len);
1151     }
1152   } else {
1153     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1154     if (etype == T_INT) {
1155       evprorvd(dst, src, shift, vector_len);
1156     } else {
1157       assert(etype == T_LONG, "expected type T_LONG");
1158       evprorvq(dst, src, shift, vector_len);
1159     }
1160   }
1161 }
1162 
1163 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1164   if (opcode == Op_RShiftVI) {
1165     psrad(dst, shift);
1166   } else if (opcode == Op_LShiftVI) {
1167     pslld(dst, shift);
1168   } else {
1169     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1170     psrld(dst, shift);
1171   }
1172 }
1173 
1174 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1175   switch (opcode) {
1176     case Op_RShiftVI:  psrad(dst, shift); break;
1177     case Op_LShiftVI:  pslld(dst, shift); break;
1178     case Op_URShiftVI: psrld(dst, shift); break;
1179 
1180     default: assert(false, "%s", NodeClassNames[opcode]);
1181   }
1182 }
1183 
1184 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1185   if (opcode == Op_RShiftVI) {
1186     vpsrad(dst, nds, shift, vector_len);
1187   } else if (opcode == Op_LShiftVI) {
1188     vpslld(dst, nds, shift, vector_len);
1189   } else {
1190     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1191     vpsrld(dst, nds, shift, vector_len);
1192   }
1193 }
1194 
1195 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1196   switch (opcode) {
1197     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1198     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1199     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1200 
1201     default: assert(false, "%s", NodeClassNames[opcode]);
1202   }
1203 }
1204 
1205 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1206   switch (opcode) {
1207     case Op_RShiftVB:  // fall-through
1208     case Op_RShiftVS:  psraw(dst, shift); break;
1209 
1210     case Op_LShiftVB:  // fall-through
1211     case Op_LShiftVS:  psllw(dst, shift);   break;
1212 
1213     case Op_URShiftVS: // fall-through
1214     case Op_URShiftVB: psrlw(dst, shift);  break;
1215 
1216     default: assert(false, "%s", NodeClassNames[opcode]);
1217   }
1218 }
1219 
1220 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1221   switch (opcode) {
1222     case Op_RShiftVB:  // fall-through
1223     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1224 
1225     case Op_LShiftVB:  // fall-through
1226     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1227 
1228     case Op_URShiftVS: // fall-through
1229     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1230 
1231     default: assert(false, "%s", NodeClassNames[opcode]);
1232   }
1233 }
1234 
1235 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1236   switch (opcode) {
1237     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems
1238     case Op_LShiftVL:  psllq(dst, shift); break;
1239     case Op_URShiftVL: psrlq(dst, shift); break;
1240 
1241     default: assert(false, "%s", NodeClassNames[opcode]);
1242   }
1243 }
1244 
1245 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1246   if (opcode == Op_RShiftVL) {
1247     psrlq(dst, shift);  // using srl to implement sra on pre-avs512 systems
1248   } else if (opcode == Op_LShiftVL) {
1249     psllq(dst, shift);
1250   } else {
1251     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1252     psrlq(dst, shift);
1253   }
1254 }
1255 
1256 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1257   switch (opcode) {
1258     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1259     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1260     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1261 
1262     default: assert(false, "%s", NodeClassNames[opcode]);
1263   }
1264 }
1265 
1266 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1267   if (opcode == Op_RShiftVL) {
1268     evpsraq(dst, nds, shift, vector_len);
1269   } else if (opcode == Op_LShiftVL) {
1270     vpsllq(dst, nds, shift, vector_len);
1271   } else {
1272     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1273     vpsrlq(dst, nds, shift, vector_len);
1274   }
1275 }
1276 
1277 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1278   switch (opcode) {
1279     case Op_RShiftVB:  // fall-through
1280     case Op_RShiftVS:  // fall-through
1281     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1282 
1283     case Op_LShiftVB:  // fall-through
1284     case Op_LShiftVS:  // fall-through
1285     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1286 
1287     case Op_URShiftVB: // fall-through
1288     case Op_URShiftVS: // fall-through
1289     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1290 
1291     default: assert(false, "%s", NodeClassNames[opcode]);
1292   }
1293 }
1294 
1295 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1296   switch (opcode) {
1297     case Op_RShiftVB:  // fall-through
1298     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1299 
1300     case Op_LShiftVB:  // fall-through
1301     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1302 
1303     case Op_URShiftVB: // fall-through
1304     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1305 
1306     default: assert(false, "%s", NodeClassNames[opcode]);
1307   }
1308 }
1309 
// Per-lane variable shift of 64-bit elements.
// Left and unsigned-right shifts have direct AVX2 encodings. The arithmetic
// right shift has no AVX2 instruction: on AVX512 it uses evpsravq (widening
// the encoding to 512 bits when AVX512VL is unavailable); on AVX2 it is
// emulated from logical shifts via sign extension: with m = sign_mask >>> s,
// the result is ((x >>> s) ^ m) - m, which re-propagates the sign bits that
// the logical shift dropped. tmp is only needed for that AVX2 fallback.
void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        // tmp = per-lane 0x8000...0000, then shifted logically by the same counts.
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        // dst = (dst ^ tmp) - tmp: restores the sign bits above the shifted value.
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}
1342 
// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst.
// 128-bit byte-element variable shift: widens the bytes to dwords in a 256-bit
// register (sign- or zero-extended to match the shift kind), performs the
// per-lane dword shift, masks each result back down to its low byte, and packs
// the dwords of both 128-bit halves into word lanes of dst.
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  // Only the 128-bit input case is supported by this helper.
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);
  // Shift counts are always zero-extended (they are small non-negative values).
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  // Keep only the low byte of each shifted dword.
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
  // Pack the dwords from the high and low 128-bit halves into words.
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}
1357 
// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst.
// AVX512 byte-element variable shift: widens the bytes to words at the next
// vector size up, performs the per-lane word shift, masks each result back to
// its low byte, and packs back down to bytes. The packs operate within 128-bit
// lanes, so the 256-bit case additionally permutes the quadwords (0xD8) to
// restore element order.
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  // Widened data needs one vector size larger than the byte input.
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  // Shift counts are always zero-extended (they are small non-negative values).
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  // Keep only the low byte of each shifted word.
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
  if (vector_len == 0) {
    // 128-bit result: pack the two halves of the 256-bit intermediate.
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    // 256-bit result: pack, then fix the in-lane interleaving left by vpackuswb.
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    vpermq(dst, dst, 0xD8, vector_len);
  }
}
1378 
1379 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1380   switch(typ) {
1381     case T_BYTE:
1382       pinsrb(dst, val, idx);
1383       break;
1384     case T_SHORT:
1385       pinsrw(dst, val, idx);
1386       break;
1387     case T_INT:
1388       pinsrd(dst, val, idx);
1389       break;
1390     case T_LONG:
1391       pinsrq(dst, val, idx);
1392       break;
1393     default:
1394       assert(false,"Should not reach here.");
1395       break;
1396   }
1397 }
1398 
1399 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1400   switch(typ) {
1401     case T_BYTE:
1402       vpinsrb(dst, src, val, idx);
1403       break;
1404     case T_SHORT:
1405       vpinsrw(dst, src, val, idx);
1406       break;
1407     case T_INT:
1408       vpinsrd(dst, src, val, idx);
1409       break;
1410     case T_LONG:
1411       vpinsrq(dst, src, val, idx);
1412       break;
1413     default:
1414       assert(false,"Should not reach here.");
1415       break;
1416   }
1417 }
1418 
// Masked gather of one 8-byte slice (8 bytes or 4 shorts) into dst.
// For each lane: if bit mask_idx of `mask` is set, load the element at
// base[idx_base[i]] into the lane; otherwise leave the lane zero. mask_idx is
// advanced by one per lane so the caller can chain multiple slices.
void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
                                         Register base, Register idx_base,
                                         Register mask, Register mask_idx,
                                         Register rtmp, int vlen_enc) {
  // Zero dst so skipped lanes read as 0.
  vpxor(dst, dst, dst, vlen_enc);
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      // btq copies the tested mask bit into CF; carryClear skips the load.
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      // Indices are stored as 32-bit values.
      movl(rtmp, Address(idx_base, i * 4));
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
      bind(skip_load);
      incq(mask_idx);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      pinsrb(dst, Address(base, rtmp), i);
      bind(skip_load);
      incq(mask_idx);
    }
  }
}
1449 
1450 void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
1451                                   Register base, Register idx_base,
1452                                   Register rtmp, int vlen_enc) {
1453   vpxor(dst, dst, dst, vlen_enc);
1454   if (elem_bt == T_SHORT) {
1455     for (int i = 0; i < 4; i++) {
1456       // dst[i] = src[idx_base[i]]
1457       movl(rtmp, Address(idx_base, i * 4));
1458       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1459     }
1460   } else {
1461     assert(elem_bt == T_BYTE, "");
1462     for (int i = 0; i < 8; i++) {
1463       // dst[i] = src[idx_base[i]]
1464       movl(rtmp, Address(idx_base, i * 4));
1465       pinsrb(dst, Address(base, rtmp), i);
1466     }
1467   }
1468 }
1469 
1470 /*
1471  * Gather using hybrid algorithm, first partially unroll scalar loop
1472  * to accumulate values from gather indices into a quad-word(64bit) slice.
1473  * A slice may hold 8 bytes or 4 short values. This is followed by a vector
1474  * permutation to place the slice into appropriate vector lane
1475  * locations in destination vector. Following pseudo code describes the
1476  * algorithm in detail:
1477  *
1478  * DST_VEC = ZERO_VEC
1479  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1480  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1481  * FOREACH_ITER:
1482  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1483  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1484  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1485  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1486  *
1487  * With each iteration, doubleword permute indices (0,1) corresponding
1488  * to gathered quadword gets right shifted by two lane positions.
1489  *
1490  */
// Sub-word (byte/short) gather; see the algorithm description in the comment
// above. Gathers 8-byte slices via vgather8b[_masked] (mask == noreg selects
// the unmasked path), then ORs each slice into its destination position using
// a permute whose indices are decremented by 2 dwords per iteration.
// `length` counts the elements still to gather; `vector_len` is the total
// element count.
void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
                                        Register base, Register idx_base,
                                        Register mask, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister temp_dst,
                                        Register rtmp, Register mask_idx,
                                        Register length, int vector_len, int vlen_enc) {
  Label GATHER8_LOOP;
  assert(is_subword_type(elem_ty), "");
  movl(length, vector_len);
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
  vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
  // Build the constant {2, 2, ...}: (0 - (-1)) << 1 per dword lane.
  vallones(xtmp2, vlen_enc);
  vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
  vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
  load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}

  bind(GATHER8_LOOP);
    // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
    if (mask == noreg) {
      vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
    } else {
      vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
    }
    // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
    vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
    // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
    vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
    // DST_VEC = DST_VEC OR TEMP_PERM_VEC
    vpor(dst, dst, temp_dst, vlen_enc);
    // Each iteration consumes 8 indices for bytes, 4 for shorts
    // (4 bytes per index -> advance 32 or 16 bytes).
    addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
    subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
    jcc(Assembler::notEqual, GATHER8_LOOP);
}
1524 
1525 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1526   switch(typ) {
1527     case T_INT:
1528       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1529       break;
1530     case T_FLOAT:
1531       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1532       break;
1533     case T_LONG:
1534       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1535       break;
1536     case T_DOUBLE:
1537       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1538       break;
1539     default:
1540       assert(false,"Should not reach here.");
1541       break;
1542   }
1543 }
1544 
1545 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1546   switch(typ) {
1547     case T_INT:
1548       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1549       break;
1550     case T_FLOAT:
1551       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1552       break;
1553     case T_LONG:
1554       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1555       break;
1556     case T_DOUBLE:
1557       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1558       break;
1559     default:
1560       assert(false,"Should not reach here.");
1561       break;
1562   }
1563 }
1564 
1565 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1566   switch(typ) {
1567     case T_INT:
1568       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1569       break;
1570     case T_FLOAT:
1571       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1572       break;
1573     case T_LONG:
1574       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1575       break;
1576     case T_DOUBLE:
1577       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1578       break;
1579     default:
1580       assert(false,"Should not reach here.");
1581       break;
1582   }
1583 }
1584 
1585 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1586   if (vlen_in_bytes <= 16) {
1587     pxor (dst, dst);
1588     psubb(dst, src);
1589     switch (elem_bt) {
1590       case T_BYTE:   /* nothing to do */ break;
1591       case T_SHORT:  pmovsxbw(dst, dst); break;
1592       case T_INT:    pmovsxbd(dst, dst); break;
1593       case T_FLOAT:  pmovsxbd(dst, dst); break;
1594       case T_LONG:   pmovsxbq(dst, dst); break;
1595       case T_DOUBLE: pmovsxbq(dst, dst); break;
1596 
1597       default: assert(false, "%s", type2name(elem_bt));
1598     }
1599   } else {
1600     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1601     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1602 
1603     vpxor (dst, dst, dst, vlen_enc);
1604     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1605 
1606     switch (elem_bt) {
1607       case T_BYTE:   /* nothing to do */            break;
1608       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1609       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1610       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1611       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1612       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1613 
1614       default: assert(false, "%s", type2name(elem_bt));
1615     }
1616   }
1617 }
1618 
// Convert a per-byte 0/1 boolean vector in 'src' into an opmask register.
// 'novlbwdq' selects a fallback path for AVX-512 CPUs lacking the VL/BW/DQ
// extensions required by evpmovb2m.
void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
  if (novlbwdq) {
    // Widen booleans to int lanes and compare against the reference bit
    // pattern from the stub constant table to derive the mask.
    vpmovsxbd(xtmp, src, vlen_enc);
    evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
            Assembler::eq, true, vlen_enc, noreg);
  } else {
    // Negate (0 - src) so each true lane becomes -1 (sign bit set), then
    // move the per-byte sign bits into the opmask.
    vpxor(xtmp, xtmp, xtmp, vlen_enc);
    vpsubb(xtmp, xtmp, src, vlen_enc);
    evpmovb2m(dst, xtmp, vlen_enc);
  }
}
1630 
1631 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1632   if (is_integral_type(bt)) {
1633     switch (vlen_in_bytes) {
1634       case 4:  movdl(dst, src);   break;
1635       case 8:  movq(dst, src);    break;
1636       case 16: movdqu(dst, src);  break;
1637       case 32: vmovdqu(dst, src); break;
1638       case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1639       default: ShouldNotReachHere();
1640     }
1641   } else {
1642     switch (vlen_in_bytes) {
1643       case 4:  movflt(dst, src); break;
1644       case 8:  movdbl(dst, src); break;
1645       case 16: movups(dst, src); break;
1646       case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1647       case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1648       default: ShouldNotReachHere();
1649     }
1650   }
1651 }
1652 
1653 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1654   assert(rscratch != noreg || always_reachable(src), "missing");
1655 
1656   if (reachable(src)) {
1657     load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1658   } else {
1659     lea(rscratch, src);
1660     load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1661   }
1662 }
1663 
// Broadcast a scalar constant located at 'src' across all lanes of 'dst',
// choosing the cheapest broadcast form the current CPU supports.
void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
  int vlen_enc = vector_length_encoding(vlen);
  if (VM_Version::supports_avx()) {
    if (bt == T_LONG) {
      // AVX2 has a true 64-bit integer broadcast; plain AVX falls back to
      // duplicating the low double-word pair.
      if (VM_Version::supports_avx2()) {
        vpbroadcastq(dst, src, vlen_enc);
      } else {
        vmovddup(dst, src, vlen_enc);
      }
    } else if (bt == T_DOUBLE) {
      // vbroadcastsd has no 128-bit form; use vmovddup there.
      if (vlen_enc != Assembler::AVX_128bit) {
        vbroadcastsd(dst, src, vlen_enc, noreg);
      } else {
        vmovddup(dst, src, vlen_enc);
      }
    } else {
      // 32-bit payloads: integer broadcast on AVX2, FP broadcast otherwise.
      if (VM_Version::supports_avx2() && is_integral_type(bt)) {
        vpbroadcastd(dst, src, vlen_enc);
      } else {
        vbroadcastss(dst, src, vlen_enc);
      }
    }
  } else if (VM_Version::supports_sse3()) {
    movddup(dst, src);
  } else {
    // No broadcast support at all: fall back to a plain vector load.
    load_vector(bt, dst, src, vlen);
  }
}
1692 
1693 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1694   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1695   int offset = exact_log2(type2aelembytes(bt)) << 6;
1696   if (is_floating_point_type(bt)) {
1697     offset += 128;
1698   }
1699   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1700   load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1701 }
1702 
1703 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1704 
1705 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1706   int vector_len = Assembler::AVX_128bit;
1707 
1708   switch (opcode) {
1709     case Op_AndReductionV:  pand(dst, src); break;
1710     case Op_OrReductionV:   por (dst, src); break;
1711     case Op_XorReductionV:  pxor(dst, src); break;
1712     case Op_MinReductionV:
1713       switch (typ) {
1714         case T_BYTE:        pminsb(dst, src); break;
1715         case T_SHORT:       pminsw(dst, src); break;
1716         case T_INT:         pminsd(dst, src); break;
1717         case T_LONG:        assert(UseAVX > 2, "required");
1718                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1719         default:            assert(false, "wrong type");
1720       }
1721       break;
1722     case Op_MaxReductionV:
1723       switch (typ) {
1724         case T_BYTE:        pmaxsb(dst, src); break;
1725         case T_SHORT:       pmaxsw(dst, src); break;
1726         case T_INT:         pmaxsd(dst, src); break;
1727         case T_LONG:        assert(UseAVX > 2, "required");
1728                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1729         default:            assert(false, "wrong type");
1730       }
1731       break;
1732     case Op_UMinReductionV:
1733       switch (typ) {
1734         case T_BYTE:        vpminub(dst, dst, src, Assembler::AVX_128bit); break;
1735         case T_SHORT:       vpminuw(dst, dst, src, Assembler::AVX_128bit); break;
1736         case T_INT:         vpminud(dst, dst, src, Assembler::AVX_128bit); break;
1737         case T_LONG:        evpminuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
1738         default:            assert(false, "wrong type");
1739       }
1740       break;
1741     case Op_UMaxReductionV:
1742       switch (typ) {
1743         case T_BYTE:        vpmaxub(dst, dst, src, Assembler::AVX_128bit); break;
1744         case T_SHORT:       vpmaxuw(dst, dst, src, Assembler::AVX_128bit); break;
1745         case T_INT:         vpmaxud(dst, dst, src, Assembler::AVX_128bit); break;
1746         case T_LONG:        evpmaxuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
1747         default:            assert(false, "wrong type");
1748       }
1749       break;
1750     case Op_AddReductionVF: addss(dst, src); break;
1751     case Op_AddReductionVD: addsd(dst, src); break;
1752     case Op_AddReductionVI:
1753       switch (typ) {
1754         case T_BYTE:        paddb(dst, src); break;
1755         case T_SHORT:       paddw(dst, src); break;
1756         case T_INT:         paddd(dst, src); break;
1757         default:            assert(false, "wrong type");
1758       }
1759       break;
1760     case Op_AddReductionVL: paddq(dst, src); break;
1761     case Op_MulReductionVF: mulss(dst, src); break;
1762     case Op_MulReductionVD: mulsd(dst, src); break;
1763     case Op_MulReductionVI:
1764       switch (typ) {
1765         case T_SHORT:       pmullw(dst, src); break;
1766         case T_INT:         pmulld(dst, src); break;
1767         default:            assert(false, "wrong type");
1768       }
1769       break;
1770     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1771                             evpmullq(dst, dst, src, vector_len); break;
1772     default:                assert(false, "wrong opcode");
1773   }
1774 }
1775 
// Emit one 128-bit step of an unordered FP reduction (lane-parallel packed
// add/mul; lane combination order is unconstrained).
void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
  switch (opcode) {
    case Op_AddReductionVF: addps(dst, src); break;
    case Op_AddReductionVD: addpd(dst, src); break;
    case Op_MulReductionVF: mulps(dst, src); break;
    case Op_MulReductionVD: mulpd(dst, src); break;
    default:                assert(false, "%s", NodeClassNames[opcode]);
  }
}
1785 
// Emit one 256-bit step of a reduction: dst = src1 <op> src2, element type
// 'typ'. Long min/max and long multiply require AVX-512 instructions.
void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
    case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
    case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
        case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpminsq(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpmaxsq(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMinReductionV:
      switch (typ) {
        case T_BYTE:        vpminub(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpminuw(dst, src1, src2, vector_len); break;
        case T_INT:         vpminud(dst, src1, src2, vector_len); break;
        case T_LONG:        evpminuq(dst, k0, src1, src2, true, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxub(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpmaxuw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmaxud(dst, src1, src2, vector_len); break;
        case T_LONG:        evpmaxuq(dst, k0, src1, src2, true, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
        case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}
1851 
// Emit one 256-bit step of an unordered FP reduction (lane-parallel packed
// add/mul; lane combination order is unconstrained).
void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
    case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
    case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
    case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
    default:                assert(false, "%s", NodeClassNames[opcode]);
  }
}
1863 
1864 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1865                                   XMMRegister dst, XMMRegister src,
1866                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1867   switch (opcode) {
1868     case Op_AddReductionVF:
1869     case Op_MulReductionVF:
1870       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1871       break;
1872 
1873     case Op_AddReductionVD:
1874     case Op_MulReductionVD:
1875       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1876       break;
1877 
1878     default: assert(false, "wrong opcode");
1879   }
1880 }
1881 
1882 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1883                                             XMMRegister dst, XMMRegister src,
1884                                             XMMRegister vtmp1, XMMRegister vtmp2) {
1885   switch (opcode) {
1886     case Op_AddReductionVF:
1887     case Op_MulReductionVF:
1888       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1889       break;
1890 
1891     case Op_AddReductionVD:
1892     case Op_MulReductionVD:
1893       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1894       break;
1895 
1896     default: assert(false, "%s", NodeClassNames[opcode]);
1897   }
1898 }
1899 
1900 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1901                              Register dst, Register src1, XMMRegister src2,
1902                              XMMRegister vtmp1, XMMRegister vtmp2) {
1903   switch (vlen) {
1904     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1905     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1906     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1907     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1908 
1909     default: assert(false, "wrong vector length");
1910   }
1911 }
1912 
1913 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1914                              Register dst, Register src1, XMMRegister src2,
1915                              XMMRegister vtmp1, XMMRegister vtmp2) {
1916   switch (vlen) {
1917     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1918     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1919     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1920     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1921 
1922     default: assert(false, "wrong vector length");
1923   }
1924 }
1925 
1926 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1927                              Register dst, Register src1, XMMRegister src2,
1928                              XMMRegister vtmp1, XMMRegister vtmp2) {
1929   switch (vlen) {
1930     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1931     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1932     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1933     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1934 
1935     default: assert(false, "wrong vector length");
1936   }
1937 }
1938 
1939 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1940                              Register dst, Register src1, XMMRegister src2,
1941                              XMMRegister vtmp1, XMMRegister vtmp2) {
1942   switch (vlen) {
1943     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1944     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1945     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1946     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1947 
1948     default: assert(false, "wrong vector length");
1949   }
1950 }
1951 
1952 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1953                              Register dst, Register src1, XMMRegister src2,
1954                              XMMRegister vtmp1, XMMRegister vtmp2) {
1955   switch (vlen) {
1956     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1957     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1958     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1959 
1960     default: assert(false, "wrong vector length");
1961   }
1962 }
1963 
1964 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1965   switch (vlen) {
1966     case 2:
1967       assert(vtmp2 == xnoreg, "");
1968       reduce2F(opcode, dst, src, vtmp1);
1969       break;
1970     case 4:
1971       assert(vtmp2 == xnoreg, "");
1972       reduce4F(opcode, dst, src, vtmp1);
1973       break;
1974     case 8:
1975       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1976       break;
1977     case 16:
1978       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1979       break;
1980     default: assert(false, "wrong vector length");
1981   }
1982 }
1983 
1984 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1985   switch (vlen) {
1986     case 2:
1987       assert(vtmp2 == xnoreg, "");
1988       reduce2D(opcode, dst, src, vtmp1);
1989       break;
1990     case 4:
1991       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1992       break;
1993     case 8:
1994       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1995       break;
1996     default: assert(false, "wrong vector length");
1997   }
1998 }
1999 
2000 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2001   switch (vlen) {
2002     case 2:
2003       assert(vtmp1 == xnoreg, "");
2004       assert(vtmp2 == xnoreg, "");
2005       unorderedReduce2F(opcode, dst, src);
2006       break;
2007     case 4:
2008       assert(vtmp2 == xnoreg, "");
2009       unorderedReduce4F(opcode, dst, src, vtmp1);
2010       break;
2011     case 8:
2012       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2013       break;
2014     case 16:
2015       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2016       break;
2017     default: assert(false, "wrong vector length");
2018   }
2019 }
2020 
2021 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2022   switch (vlen) {
2023     case 2:
2024       assert(vtmp1 == xnoreg, "");
2025       assert(vtmp2 == xnoreg, "");
2026       unorderedReduce2D(opcode, dst, src);
2027       break;
2028     case 4:
2029       assert(vtmp2 == xnoreg, "");
2030       unorderedReduce4D(opcode, dst, src, vtmp1);
2031       break;
2032     case 8:
2033       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2034       break;
2035     default: assert(false, "wrong vector length");
2036   }
2037 }
2038 
// Reduce the two int lanes of src2 and fold in the scalar accumulator src1;
// the final scalar lands in gpr 'dst'.
void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    // phaddd sums adjacent lane pairs; one pass combines lanes 0 and 1.
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, vtmp1);
  } else {
    // Bring lane 1 down to lane 0 and combine the two lanes.
    pshufd(vtmp1, src2, 0x1);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
  }
  // Fold in the scalar accumulator and extract the result.
  movdl(vtmp2, src1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  movdl(dst, vtmp1);
}
2053 
// Reduce four int lanes: fold the upper pair into the lower pair, then
// finish with the two-lane reduction.
void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    // One horizontal-add pass halves the live lane count (4 -> 2).
    phaddd(vtmp1, src2);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    // Move lanes 2..3 onto lanes 0..1 and combine pairwise.
    pshufd(vtmp2, src2, 0xE);
    reduce_operation_128(T_INT, opcode, vtmp2, src2);
    reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
  }
}
2067 
// Reduce eight int lanes of a 256-bit vector down through the 128-bit and
// smaller helpers.
void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    // Horizontal add across both 128-bit halves, then add the halves.
    vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
    vextracti128_high(vtmp2, vtmp1);
    vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    // Fold the high 128 bits into the low 128 bits, then reduce 4 lanes.
    vextracti128_high(vtmp1, src2);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
    reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  }
}
2080 
// Reduce sixteen int lanes: fold the upper 256 bits into the lower 256 bits,
// then reduce the remaining eight lanes.
void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
  reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2086 
// Reduce the low eight byte lanes of src2 together with the scalar
// accumulator src1, leaving the sign-extended result in gpr 'dst'.
void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  // Successively halve the live lane count: 8 -> 4 -> 2 -> 1 bytes.
  pshufd(vtmp2, src2, 0x1);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  movdqu(vtmp1, vtmp2);
  psrldq(vtmp1, 2);
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  movdqu(vtmp2, vtmp1);
  psrldq(vtmp2, 1);
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  movdl(vtmp2, src1);
  // Widen the byte result to int before folding in the accumulator:
  // zero-extend for unsigned min/max, sign-extend otherwise.
  if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
    pmovzxbd(vtmp1, vtmp1);
  } else {
    pmovsxbd(vtmp1, vtmp1);
  }
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrb(dst, vtmp1, 0x0);
  movsbl(dst, dst);
}
2106 
// Reduce sixteen byte lanes: fold the upper eight bytes onto the lower
// eight, then reduce the remaining eight lanes.
void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp1, src2, 0xE);
  reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
  reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2112 
// Reduce thirty-two byte lanes: fold the upper 128 bits into the lower
// 128 bits, then reduce the remaining sixteen lanes.
void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp2, src2);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2118 
// Reduce sixty-four byte lanes: fold the upper 256 bits into the lower
// 256 bits, then reduce the remaining thirty-two lanes.
void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp1, src2);
  reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
  reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2124 
// Multiply-reduce eight bytes: widen to shorts (no packed byte multiply
// exists) and reduce as shorts.
void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pmovsxbw(vtmp2, src2);
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2129 
2130 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2131   if (UseAVX > 1) {
2132     int vector_len = Assembler::AVX_256bit;
2133     vpmovsxbw(vtmp1, src2, vector_len);
2134     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2135   } else {
2136     pmovsxbw(vtmp2, src2);
2137     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2138     pshufd(vtmp2, src2, 0x1);
2139     pmovsxbw(vtmp2, src2);
2140     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2141   }
2142 }
2143 
// Multiply-reduce thirty-two bytes. With AVX-512BW the whole vector widens
// to 32 shorts at once; otherwise reduce each 128-bit half separately.
void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
    int vector_len = Assembler::AVX_512bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    assert(UseAVX >= 2,"Should not reach here.");
    // Low half first, then fold the high half into the running result.
    mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
    vextracti128_high(vtmp2, src2);
    mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
  }
}
2156 
// Multiply-reduce sixty-four bytes: low 256 bits first, then fold the high
// 256 bits into the running result.
void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
  vextracti64x4_high(vtmp2, src2);
  mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
}
2162 
// Reduce the low four short lanes of src2 together with the scalar
// accumulator src1, leaving the sign-extended result in gpr 'dst'.
void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    // Two horizontal-add passes: 4 lanes -> 2 -> 1.
    phaddw(vtmp1, vtmp1);
    phaddw(vtmp1, vtmp1);
  } else {
    // Halve the live lane count twice: 4 -> 2 -> 1 shorts.
    pshufd(vtmp2, src2, 0x1);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
    movdqu(vtmp1, vtmp2);
    psrldq(vtmp1, 2);
    reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
  }
  movdl(vtmp2, src1);
  // Widen the short result to int before folding in the accumulator:
  // zero-extend for unsigned min/max, sign-extend otherwise.
  if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
    pmovzxwd(vtmp1, vtmp1);
  } else {
    pmovsxwd(vtmp1, vtmp1);
  }
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrw(dst, vtmp1, 0x0);
  movswl(dst, dst);
}
2187 
// Reduce eight short lanes: fold the upper four onto the lower four, then
// finish with the four-lane reduction.
void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    // One horizontal-add pass halves the live lane count (8 -> 4).
    phaddw(vtmp1, src2);
  } else {
    // Move shorts 4..7 onto shorts 0..3 and combine pairwise.
    pshufd(vtmp1, src2, 0xE);
    reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
  }
  reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2200 
// Reduce sixteen short lanes of a 256-bit vector down to the eight-lane
// helper.
void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    int vector_len = Assembler::AVX_256bit;
    // Horizontal add then 0xD8 qword permute to gather the live partial
    // sums into the low 128 bits.
    vphaddw(vtmp2, src2, src2, vector_len);
    vpermq(vtmp2, vtmp2, 0xD8, vector_len);
  } else {
    // Fold the high 128 bits into the low 128 bits.
    vextracti128_high(vtmp2, src2);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
  }
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2212 
2213 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2214   int vector_len = Assembler::AVX_256bit;
2215   vextracti64x4_high(vtmp1, src2);
2216   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2217   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2218 }
2219 
// Reduce the two long lanes of src2 together with the scalar accumulator
// src1, leaving the result in gpr 'dst'.
void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  // Bring lane 1 down to lane 0 and combine the two lanes.
  pshufd(vtmp2, src2, 0xE);
  reduce_operation_128(T_LONG, opcode, vtmp2, src2);
  // Fold in the scalar accumulator and extract the result.
  movdq(vtmp1, src1);
  reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
  movdq(dst, vtmp1);
}
2227 
// Reduce four long lanes: fold the upper 128 bits into the lower 128 bits,
// then reduce the remaining two lanes.
void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp1, src2);
  reduce_operation_128(T_LONG, opcode, vtmp1, src2);
  reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2233 
// Reduce eight long lanes: fold the upper 256 bits into the lower 256 bits,
// then reduce the remaining four lanes.
void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
  reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2239 
// Build an opmask with the low 'len' bits set (requires BMI2 bzhi;
// len is taken modulo the operand width by the hardware).
void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
  mov64(temp, -1L);
  bzhiq(temp, temp, len); // zero all bits at positions >= len
  kmovql(dst, temp);
}
2245 
// Ordered float reduction over two lanes: dst = (dst op src[0]) op src[1],
// using scalar ops so the strict evaluation order is preserved.
void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_FLOAT, opcode, dst, src);
  pshufd(vtmp, src, 0x1);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
2251 
// Ordered float reduction over four lanes: accumulate lanes 0..3 into dst
// in strict lane order.
void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce2F(opcode, dst, src, vtmp);
  pshufd(vtmp, src, 0x2);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
  pshufd(vtmp, src, 0x3);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
2259 
// Ordered float reduction over eight lanes: low 128 bits first, then the
// high 128 bits, preserving lane order.
void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4F(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce4F(opcode, dst, vtmp2, vtmp1);
}
2265 
// Ordered float reduction over sixteen lanes: low 256 bits first, then the
// high 256 bits, preserving lane order.
void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce8F(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
}
2271 
// Unordered float reduction over two lanes: dst = src[1] op src[0]
// (lane combination order is unconstrained for the unordered form).
void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0x1);
  reduce_operation_128(T_FLOAT, opcode, dst, src);
}
2276 
// Unordered float reduction over four lanes: combine the upper and lower
// lane pairs packed-wise, then finish with the two-lane step.
void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  pshufd(vtmp, src, 0xE);
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
  unorderedReduce2F(opcode, dst, vtmp);
}
2282 
// Unordered float reduction over eight lanes: fold the high 128 bits into
// the low 128 bits packed-wise, then finish with the four-lane step.
void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf128_high(vtmp1, src);
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
  unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
}
2288 
// Unordered float reduction over sixteen lanes: fold the high 256 bits into
// the low 256 bits packed-wise, then finish with the eight-lane step.
void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);
  unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
  unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
}
2294 
void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  // In-order reduction of 2 doubles: fold element 0, then move element 1
  // down into lane 0 of vtmp and fold it.
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
  pshufd(vtmp, src, 0xE);  // vtmp[0] = src[1] (upper 64 bits moved down)
  reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
}
2300 
void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  // In-order reduction of 4 doubles: low 128-bit lane first, then the high lane.
  reduce2D(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);  // vtmp2 = src[2..3]
  reduce2D(opcode, dst, vtmp2, vtmp1);
}
2306 
void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  // In-order reduction of 8 doubles: low 256 bits first, then the high 256 bits.
  reduce4D(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);  // vtmp1 = src[4..7]
  // vtmp1 doubles as source and scratch; reduce4D consumes the source first.
  reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
}
2312 
void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
  // Unordered reduction of 2 doubles in a single combine step.
  pshufd(dst, src, 0xE);  // dst[0] = src[1]
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
}
2317 
void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  // Unordered reduction of 4 doubles: fold the high 128-bit lane into the
  // low lane, then reduce the remaining pair.
  vextractf128_high(vtmp, src);
  unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
  unorderedReduce2D(opcode, dst, vtmp);
}
2323 
void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  // Unordered reduction of 8 doubles: fold the high 256 bits into the low
  // 256 bits, then reduce the remaining 4 elements.
  vextractf64x4_high(vtmp2, src);
  unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
  unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
}
2329 
// Masked vector load (memory -> register); forwards to the shared
// MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2333 
// Masked vector store (register -> memory); forwards to the shared
// MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2337 
// Masked register-to-register vector move; forwards to the shared
// MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2341 
2342 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2343                                  int vec_enc) {
2344   switch(elem_bt) {
2345     case T_INT:
2346     case T_FLOAT:
2347       vmaskmovps(dst, src, mask, vec_enc);
2348       break;
2349     case T_LONG:
2350     case T_DOUBLE:
2351       vmaskmovpd(dst, src, mask, vec_enc);
2352       break;
2353     default:
2354       fatal("Unsupported type %s", type2name(elem_bt));
2355       break;
2356   }
2357 }
2358 
2359 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2360                                  int vec_enc) {
2361   switch(elem_bt) {
2362     case T_INT:
2363     case T_FLOAT:
2364       vmaskmovps(dst, src, mask, vec_enc);
2365       break;
2366     case T_LONG:
2367     case T_DOUBLE:
2368       vmaskmovpd(dst, src, mask, vec_enc);
2369       break;
2370     default:
2371       fatal("Unsupported type %s", type2name(elem_bt));
2372       break;
2373   }
2374 }
2375 
// Min/max reduction over the 'vlen' float elements of 'src' (vlen is a power
// of two, 2..16). Each loop iteration halves the number of live elements by
// combining the upper half with the lower half; after log2(vlen) steps the
// result is in element 0. When 'is_dst_valid' is true the incoming value of
// 'dst' is folded in as an extra final step (accumulator form); otherwise the
// last iteration writes straight into 'dst'.
void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
                                          XMMRegister dst, XMMRegister src,
                                          XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                          XMMRegister xmm_0, XMMRegister xmm_1) {
  const int permconst[] = {1, 14};  // vpermilps imms: 1 -> [1,0,0,0], 14 -> [2,3,0,0]
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;

  // 16 floats need a 256-bit first combine step (upper 256 vs lower 256).
  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 16) {
    vlen_enc = Assembler::AVX_256bit;
  }

  for (int i = log2(vlen) - 1; i >=0; i--) {
    if (i == 0 && !is_dst_valid) {
      wdst = dst;  // last step: write the result directly into dst
    }
    if (i == 3) {
      vextracti64x4_high(wtmp, wsrc);   // 512 -> 256: bring the upper 8 floats down
    } else if (i == 2) {
      vextracti128_high(wtmp, wsrc);    // 256 -> 128: bring the upper 4 floats down
    } else { // i = [0,1]
      vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
    }

    if (VM_Version::supports_avx10_2()) {
      // AVX10.2 min/max needs no scratch registers.
      vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }
    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit;   // all subsequent combines fit in 128 bits
  }
  if (is_dst_valid) {
    // Fold the pre-existing accumulator value of dst into the reduced result.
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2418 
// Min/max reduction over the 'vlen' double elements of 'src' (vlen is a power
// of two, 2..8). Mirrors reduceFloatMinMax: repeatedly combine the upper half
// with the lower half until one element remains, then optionally fold in the
// incoming accumulator value of 'dst' when 'is_dst_valid' is set.
void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
                                        XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                        XMMRegister xmm_0, XMMRegister xmm_1) {
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
  // 8 doubles need a 256-bit first combine step (upper 256 vs lower 256).
  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 8) {
    vlen_enc = Assembler::AVX_256bit;
  }
  for (int i = log2(vlen) - 1; i >=0; i--) {
    if (i == 0 && !is_dst_valid) {
      wdst = dst;  // last step: write the result directly into dst
    }
    if (i == 1) {
      vextracti128_high(wtmp, wsrc);    // 256 -> 128: bring the upper 2 doubles down
    } else if (i == 2) {
      vextracti64x4_high(wtmp, wsrc);   // 512 -> 256: bring the upper 4 doubles down
    } else {
      assert(i == 0, "%d", i);
      vpermilpd(wtmp, wsrc, 1, vlen_enc);  // swap the two remaining doubles
    }

    if (VM_Version::supports_avx10_2()) {
      // AVX10.2 min/max needs no scratch registers.
      vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }

    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit;   // all subsequent combines fit in 128 bits
  }

  if (is_dst_valid) {
    // Fold the pre-existing accumulator value of dst into the reduced result.
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2460 
2461 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2462   switch (bt) {
2463     case T_BYTE:  pextrb(dst, src, idx); break;
2464     case T_SHORT: pextrw(dst, src, idx); break;
2465     case T_INT:   pextrd(dst, src, idx); break;
2466     case T_LONG:  pextrq(dst, src, idx); break;
2467 
2468     default:
2469       assert(false,"Should not reach here.");
2470       break;
2471   }
2472 }
2473 
2474 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2475   int esize =  type2aelembytes(typ);
2476   int elem_per_lane = 16/esize;
2477   int lane = elemindex / elem_per_lane;
2478   int eindex = elemindex % elem_per_lane;
2479 
2480   if (lane >= 2) {
2481     assert(UseAVX > 2, "required");
2482     vextractf32x4(dst, src, lane & 3);
2483     return dst;
2484   } else if (lane > 0) {
2485     assert(UseAVX > 0, "required");
2486     vextractf128(dst, src, lane);
2487     return dst;
2488   } else {
2489     return src;
2490   }
2491 }
2492 
2493 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2494   if (typ == T_BYTE) {
2495     movsbl(dst, dst);
2496   } else if (typ == T_SHORT) {
2497     movswl(dst, dst);
2498   }
2499 }
2500 
2501 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2502   int esize =  type2aelembytes(typ);
2503   int elem_per_lane = 16/esize;
2504   int eindex = elemindex % elem_per_lane;
2505   assert(is_integral_type(typ),"required");
2506 
2507   if (eindex == 0) {
2508     if (typ == T_LONG) {
2509       movq(dst, src);
2510     } else {
2511       movdl(dst, src);
2512       movsxl(typ, dst);
2513     }
2514   } else {
2515     extract(typ, dst, src, eindex);
2516     movsxl(typ, dst);
2517   }
2518 }
2519 
// Move the float/double element at 'elemindex' (reduced modulo the elements
// per 128-bit lane) from vector 'src' into element 0 of 'dst', with all
// higher bits of dst cleared.
void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
  int esize =  type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;
  assert((typ == T_FLOAT || typ == T_DOUBLE),"required");

  if (eindex == 0) {
    // Element 0 is already in place; movq copies the low 64 bits and zeroes
    // bits 64..127 of dst.
    movq(dst, src);
  } else {
    if (typ == T_FLOAT) {
      if (UseAVX == 0) {
        movdqu(dst, src);
        shufps(dst, dst, eindex);  // bring element 'eindex' into lane 0
      } else {
        vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
      }
    } else {
      if (UseAVX == 0) {
        movdqu(dst, src);
        psrldq(dst, eindex*esize);  // byte-shift the wanted double down to lane 0
      } else {
        vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
      }
      movq(dst, dst);  // clear bits 64..127
    }
  }
  // Zero upper bits
  // For floats, lanes above element 0 may still hold shuffle leftovers;
  // mask everything but the low 32 bits.
  if (typ == T_FLOAT) {
    if (UseAVX == 0) {
      assert(vtmp != xnoreg, "required.");
      movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
      pand(dst, vtmp);
    } else {
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
    }
  }
}
2557 
2558 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2559   switch(typ) {
2560     case T_BYTE:
2561     case T_BOOLEAN:
2562       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2563       break;
2564     case T_SHORT:
2565     case T_CHAR:
2566       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2567       break;
2568     case T_INT:
2569     case T_FLOAT:
2570       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2571       break;
2572     case T_LONG:
2573     case T_DOUBLE:
2574       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2575       break;
2576     default:
2577       assert(false,"Should not reach here.");
2578       break;
2579   }
2580 }
2581 
2582 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2583   assert(rscratch != noreg || always_reachable(src2), "missing");
2584 
2585   switch(typ) {
2586     case T_BOOLEAN:
2587     case T_BYTE:
2588       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2589       break;
2590     case T_CHAR:
2591     case T_SHORT:
2592       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2593       break;
2594     case T_INT:
2595     case T_FLOAT:
2596       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2597       break;
2598     case T_LONG:
2599     case T_DOUBLE:
2600       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2601       break;
2602     default:
2603       assert(false,"Should not reach here.");
2604       break;
2605   }
2606 }
2607 
2608 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2609   switch(typ) {
2610     case T_BYTE:
2611       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2612       break;
2613     case T_SHORT:
2614       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2615       break;
2616     case T_INT:
2617     case T_FLOAT:
2618       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2619       break;
2620     case T_LONG:
2621     case T_DOUBLE:
2622       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2623       break;
2624     default:
2625       assert(false,"Should not reach here.");
2626       break;
2627   }
2628 }
2629 
// Emit a vector test of src1 against src2 that sets ZF/CF for a following
// jcc. For element sizes >= 4 the sign-bit based vtestps suffices; otherwise
// the full-width ptest/vptest is used. For vectors shorter than 16 bytes,
// src1's unused lanes are first filled with copies of its live data so stale
// bits cannot affect the result.
void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
  assert(vlen_in_bytes <= 32, "");
  int esize = type2aelembytes(bt);
  if (vlen_in_bytes == 32) {
    assert(vtmp == xnoreg, "required.");
    if (esize >= 4) {
      vtestps(src1, src2, AVX_256bit);
    } else {
      vptest(src1, src2, AVX_256bit);
    }
    return;
  }
  if (vlen_in_bytes < 16) {
    // Duplicate the lower part to fill the whole register,
    // Don't need to do so for src2
    assert(vtmp != xnoreg, "required");
    // pshufd imm 0x00 replicates dword 0 everywhere (4-byte vector);
    // imm 0x04 keeps dwords 0..1 and fills the upper dwords with dword 0
    // (8-byte vector).
    int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
    pshufd(vtmp, src1, shuffle_imm);
  } else {
    assert(vtmp == xnoreg, "required");
    vtmp = src1;
  }
  if (esize >= 4 && VM_Version::supports_avx()) {
    vtestps(vtmp, src2, AVX_128bit);
  } else {
    ptest(vtmp, src2);
  }
}
2658 
2659 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2660 #ifdef ASSERT
2661   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2662   bool is_bw_supported = VM_Version::supports_avx512bw();
2663   if (is_bw && !is_bw_supported) {
2664     assert(vlen_enc != Assembler::AVX_512bit, "required");
2665     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2666            "XMM register should be 0-15");
2667   }
2668 #endif // ASSERT
2669   switch (elem_bt) {
2670     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2671     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2672     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2673     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2674     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2675     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2676     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2677   }
2678 }
2679 
// Broadcast the GPR value in 'src' to every element of vector 'dst'.
void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
  assert(UseAVX >= 2, "required");
  bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
  bool is_vl = vlen_enc != Assembler::AVX_512bit;
  if ((UseAVX > 2) &&
      (!is_bw || VM_Version::supports_avx512bw()) &&
      (!is_vl || VM_Version::supports_avx512vl())) {
    // EVEX-encoded broadcasts accept a GPR source directly.
    switch (elem_bt) {
      case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
      case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
      case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
      case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  } else {
    // Pre-AVX512 (or missing BW/VL support): move the value into the xmm
    // register first, then broadcast register-to-register.
    assert(vlen_enc != Assembler::AVX_512bit, "required");
    assert((dst->encoding() < 16),"XMM register should be 0-15");
    switch (elem_bt) {
      case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
      case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
      case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
      case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
      case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
      case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  }
}
2708 
// Widen a vector of bytes in 'src' to the requested element type in 'dst':
// sign-extend for integral targets, sign-extend plus int->FP conversion for
// floating point targets.
void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
  switch (to_elem_bt) {
    case T_SHORT:
      vpmovsxbw(dst, src, vlen_enc);
      break;
    case T_INT:
      vpmovsxbd(dst, src, vlen_enc);
      break;
    case T_FLOAT:
      vpmovsxbd(dst, src, vlen_enc);
      vcvtdq2ps(dst, dst, vlen_enc);
      break;
    case T_LONG:
      vpmovsxbq(dst, src, vlen_enc);
      break;
    case T_DOUBLE: {
      // vcvtdq2pd doubles the element width, so the intermediate ints occupy
      // only half of the destination vector width.
      int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
      vpmovsxbd(dst, src, mid_vlen_enc);
      vcvtdq2pd(dst, dst, vlen_enc);
      break;
    }
    default:
      fatal("Unsupported type %s", type2name(to_elem_bt));
      break;
  }
}
2735 
2736 //-------------------------------------------------------------------------------------------
2737 
// IndexOf for constant substrings with size >= one full pcmpestri stride
// (8 chars, or 16 bytes for Latin1) which therefore don't need to be loaded
// through the stack. Returns the element index of the match in 'result',
// or -1 if not found.
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2,  Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");

  // Load substring.
  // For UL the Latin1 substring bytes are zero-extended to UTF-16 shorts.
  if (ae == StrIntrinsicNode::UL) {
    pmovzxbw(vec, Address(str2, 0));
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  // Substrings longer than one stride may need a rescan when only their
  // head matched; emit the reload paths for that case.
  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring reminding elements and
    // cnt1 is number of string reminding elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less then substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less then substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only string if does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    // cnt2 runs negative from -(int_cnt2 - stride) towards 0 so that the
    // loop can address the substring tail with positive displacements.
    negptr(cnt2);
    addptr(cnt2, stride);

    bind(SCAN_SUBSTR);
    subl(cnt1, stride);
    cmpl(cnt2, -stride); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, stride);
    movl(cnt2, stride); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      // Tail offsets fit in an int32 displacement; address directly.
      int tail_off1 = int_cnt2<<scale1;
      int tail_off2 = int_cnt2<<scale2;
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
      } else {
        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
      }
      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
      } else {
        movdqu(vec, Address(str2, tmp, scale2, 0));
      }
      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
    }
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, stride);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(EXIT);

} // string_indexofC8
2916 
2917 // Small strings are loaded through stack if they cross page boundary.
2918 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2919                                        Register cnt1, Register cnt2,
2920                                        int int_cnt2,  Register result,
2921                                        XMMRegister vec, Register tmp,
2922                                        int ae) {
2923   ShortBranchVerifier sbv(this);
2924   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2925   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2926 
2927   //
2928   // int_cnt2 is length of small (< 8 chars) constant substring
2929   // or (-1) for non constant substring in which case its length
2930   // is in cnt2 register.
2931   //
2932   // Note, inline_string_indexOf() generates checks:
2933   // if (substr.count > string.count) return -1;
2934   // if (substr.count == 0) return 0;
2935   //
2936   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2937   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2938   // This method uses the pcmpestri instruction with bound registers
2939   //   inputs:
2940   //     xmm - substring
2941   //     rax - substring length (elements count)
2942   //     mem - scanned string
2943   //     rdx - string length (elements count)
2944   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2945   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2946   //   outputs:
2947   //     rcx - matched index in string
2948   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2949   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2950   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2951   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2952 
2953   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2954         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2955         FOUND_CANDIDATE;
2956 
2957   { //========================================================
2958     // We don't know where these strings are located
2959     // and we can't read beyond them. Load them through stack.
2960     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2961 
2962     movptr(tmp, rsp); // save old SP
2963 
2964     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2965       if (int_cnt2 == (1>>scale2)) { // One byte
2966         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2967         load_unsigned_byte(result, Address(str2, 0));
2968         movdl(vec, result); // move 32 bits
2969       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2970         // Not enough header space in 32-bit VM: 12+3 = 15.
2971         movl(result, Address(str2, -1));
2972         shrl(result, 8);
2973         movdl(vec, result); // move 32 bits
2974       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2975         load_unsigned_short(result, Address(str2, 0));
2976         movdl(vec, result); // move 32 bits
2977       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2978         movdl(vec, Address(str2, 0)); // move 32 bits
2979       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2980         movq(vec, Address(str2, 0));  // move 64 bits
2981       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2982         // Array header size is 12 bytes in 32-bit VM
2983         // + 6 bytes for 3 chars == 18 bytes,
2984         // enough space to load vec and shift.
2985         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2986         if (ae == StrIntrinsicNode::UL) {
2987           int tail_off = int_cnt2-8;
2988           pmovzxbw(vec, Address(str2, tail_off));
2989           psrldq(vec, -2*tail_off);
2990         }
2991         else {
2992           int tail_off = int_cnt2*(1<<scale2);
2993           movdqu(vec, Address(str2, tail_off-16));
2994           psrldq(vec, 16-tail_off);
2995         }
2996       }
2997     } else { // not constant substring
2998       cmpl(cnt2, stride);
2999       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3000 
3001       // We can read beyond string if srt+16 does not cross page boundary
3002       // since heaps are aligned and mapped by pages.
3003       assert(os::vm_page_size() < (int)G, "default page should be small");
3004       movl(result, str2); // We need only low 32 bits
3005       andl(result, ((int)os::vm_page_size()-1));
3006       cmpl(result, ((int)os::vm_page_size()-16));
3007       jccb(Assembler::belowEqual, CHECK_STR);
3008 
3009       // Move small strings to stack to allow load 16 bytes into vec.
3010       subptr(rsp, 16);
3011       int stk_offset = wordSize-(1<<scale2);
3012       push(cnt2);
3013 
3014       bind(COPY_SUBSTR);
3015       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3016         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3017         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3018       } else if (ae == StrIntrinsicNode::UU) {
3019         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3020         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3021       }
3022       decrement(cnt2);
3023       jccb(Assembler::notZero, COPY_SUBSTR);
3024 
3025       pop(cnt2);
3026       movptr(str2, rsp);  // New substring address
3027     } // non constant
3028 
3029     bind(CHECK_STR);
3030     cmpl(cnt1, stride);
3031     jccb(Assembler::aboveEqual, BIG_STRINGS);
3032 
3033     // Check cross page boundary.
3034     movl(result, str1); // We need only low 32 bits
3035     andl(result, ((int)os::vm_page_size()-1));
3036     cmpl(result, ((int)os::vm_page_size()-16));
3037     jccb(Assembler::belowEqual, BIG_STRINGS);
3038 
3039     subptr(rsp, 16);
3040     int stk_offset = -(1<<scale1);
3041     if (int_cnt2 < 0) { // not constant
3042       push(cnt2);
3043       stk_offset += wordSize;
3044     }
3045     movl(cnt2, cnt1);
3046 
3047     bind(COPY_STR);
3048     if (ae == StrIntrinsicNode::LL) {
3049       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3050       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3051     } else {
3052       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3053       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3054     }
3055     decrement(cnt2);
3056     jccb(Assembler::notZero, COPY_STR);
3057 
3058     if (int_cnt2 < 0) { // not constant
3059       pop(cnt2);
3060     }
3061     movptr(str1, rsp);  // New string address
3062 
3063     bind(BIG_STRINGS);
3064     // Load substring.
3065     if (int_cnt2 < 0) { // -1
3066       if (ae == StrIntrinsicNode::UL) {
3067         pmovzxbw(vec, Address(str2, 0));
3068       } else {
3069         movdqu(vec, Address(str2, 0));
3070       }
3071       push(cnt2);       // substr count
3072       push(str2);       // substr addr
3073       push(str1);       // string addr
3074     } else {
3075       // Small (< 8 chars) constant substrings are loaded already.
3076       movl(cnt2, int_cnt2);
3077     }
3078     push(tmp);  // original SP
3079 
3080   } // Finished loading
3081 
3082   //========================================================
3083   // Start search
3084   //
3085 
3086   movptr(result, str1); // string addr
3087 
3088   if (int_cnt2  < 0) {  // Only for non constant substring
3089     jmpb(SCAN_TO_SUBSTR);
3090 
3091     // SP saved at sp+0
3092     // String saved at sp+1*wordSize
3093     // Substr saved at sp+2*wordSize
3094     // Substr count saved at sp+3*wordSize
3095 
3096     // Reload substr for rescan, this code
3097     // is executed only for large substrings (> 8 chars)
3098     bind(RELOAD_SUBSTR);
3099     movptr(str2, Address(rsp, 2*wordSize));
3100     movl(cnt2, Address(rsp, 3*wordSize));
3101     if (ae == StrIntrinsicNode::UL) {
3102       pmovzxbw(vec, Address(str2, 0));
3103     } else {
3104       movdqu(vec, Address(str2, 0));
3105     }
3106     // We came here after the beginning of the substring was
3107     // matched but the rest of it was not so we need to search
3108     // again. Start from the next element after the previous match.
3109     subptr(str1, result); // Restore counter
3110     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3111       shrl(str1, 1);
3112     }
3113     addl(cnt1, str1);
3114     decrementl(cnt1);   // Shift to next element
3115     cmpl(cnt1, cnt2);
3116     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
3117 
3118     addptr(result, (1<<scale1));
3119   } // non constant
3120 
3121   // Scan string for start of substr in 16-byte vectors
3122   bind(SCAN_TO_SUBSTR);
3123   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3124   pcmpestri(vec, Address(result, 0), mode);
3125   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3126   subl(cnt1, stride);
3127   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3128   cmpl(cnt1, cnt2);
3129   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
3130   addptr(result, 16);
3131 
3132   bind(ADJUST_STR);
3133   cmpl(cnt1, stride); // Do not read beyond string
3134   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3135   // Back-up string to avoid reading beyond string.
3136   lea(result, Address(result, cnt1, scale1, -16));
3137   movl(cnt1, stride);
3138   jmpb(SCAN_TO_SUBSTR);
3139 
3140   // Found a potential substr
3141   bind(FOUND_CANDIDATE);
3142   // After pcmpestri tmp(rcx) contains matched element index
3143 
3144   // Make sure string is still long enough
3145   subl(cnt1, tmp);
3146   cmpl(cnt1, cnt2);
3147   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3148   // Left less then substring.
3149 
3150   bind(RET_NOT_FOUND);
3151   movl(result, -1);
3152   jmp(CLEANUP);
3153 
3154   bind(FOUND_SUBSTR);
3155   // Compute start addr of substr
3156   lea(result, Address(result, tmp, scale1));
3157   if (int_cnt2 > 0) { // Constant substring
3158     // Repeat search for small substring (< 8 chars)
3159     // from new point without reloading substring.
3160     // Have to check that we don't read beyond string.
3161     cmpl(tmp, stride-int_cnt2);
3162     jccb(Assembler::greater, ADJUST_STR);
3163     // Fall through if matched whole substring.
3164   } else { // non constant
3165     assert(int_cnt2 == -1, "should be != 0");
3166 
3167     addl(tmp, cnt2);
3168     // Found result if we matched whole substring.
3169     cmpl(tmp, stride);
3170     jcc(Assembler::lessEqual, RET_FOUND);
3171 
3172     // Repeat search for small substring (<= 8 chars)
3173     // from new point 'str1' without reloading substring.
3174     cmpl(cnt2, stride);
3175     // Have to check that we don't read beyond string.
3176     jccb(Assembler::lessEqual, ADJUST_STR);
3177 
3178     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3179     // Compare the rest of substring (> 8 chars).
3180     movptr(str1, result);
3181 
3182     cmpl(tmp, cnt2);
3183     // First 8 chars are already matched.
3184     jccb(Assembler::equal, CHECK_NEXT);
3185 
3186     bind(SCAN_SUBSTR);
3187     pcmpestri(vec, Address(str1, 0), mode);
3188     // Need to reload strings pointers if not matched whole vector
3189     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3190 
3191     bind(CHECK_NEXT);
3192     subl(cnt2, stride);
3193     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3194     addptr(str1, 16);
3195     if (ae == StrIntrinsicNode::UL) {
3196       addptr(str2, 8);
3197     } else {
3198       addptr(str2, 16);
3199     }
3200     subl(cnt1, stride);
3201     cmpl(cnt2, stride); // Do not read beyond substring
3202     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3203     // Back-up strings to avoid reading beyond substring.
3204 
3205     if (ae == StrIntrinsicNode::UL) {
3206       lea(str2, Address(str2, cnt2, scale2, -8));
3207       lea(str1, Address(str1, cnt2, scale1, -16));
3208     } else {
3209       lea(str2, Address(str2, cnt2, scale2, -16));
3210       lea(str1, Address(str1, cnt2, scale1, -16));
3211     }
3212     subl(cnt1, cnt2);
3213     movl(cnt2, stride);
3214     addl(cnt1, stride);
3215     bind(CONT_SCAN_SUBSTR);
3216     if (ae == StrIntrinsicNode::UL) {
3217       pmovzxbw(vec, Address(str2, 0));
3218     } else {
3219       movdqu(vec, Address(str2, 0));
3220     }
3221     jmp(SCAN_SUBSTR);
3222 
3223     bind(RET_FOUND_LONG);
3224     movptr(str1, Address(rsp, wordSize));
3225   } // non constant
3226 
3227   bind(RET_FOUND);
3228   // Compute substr offset
3229   subptr(result, str1);
3230   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3231     shrl(result, 1); // index
3232   }
3233   bind(CLEANUP);
3234   pop(rsp); // restore SP
3235 
3236 } // string_indexof
3237 
// Intrinsic for indexOf(char) on a UTF-16 char sequence: scans the 16-bit
// elements at [str1, str1 + 2*cnt1) for the first element equal to 'ch'.
// On exit 'result' holds the element (char) index of the first match, or -1
// if the character does not occur. Scans 16 chars per iteration with AVX2
// when available, then 8 chars per iteration with SSE4.2, then a scalar
// tail loop. Clobbers cnt1, ch, tmp and vec1..vec3.
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 8;  // 8 chars (16 bytes) per XMM vector

  Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
        SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);  // result tracks the current scan address
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR);        // fewer than 8 chars: scalar loop only
    cmpl(cnt1, 2*stride);
    jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); // fewer than 16 chars: skip the YMM loop
    movdl(vec1, ch);
    vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); // vec1 = 'ch' in all 16 word lanes
    vpxor(vec2, vec2);                               // vec2 = 0, used as the ptest mask
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
    andl(cnt1,0x0000000F);  //tail count (in chars)

    // Compare 16 chars (32 bytes) per iteration.
    bind(SCAN_TO_16_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqw(vec3, vec3, vec1, 1);
    vptest(vec2, vec3);                  // CF is clear iff some lane matched (vec3 != 0)
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, 2*stride);
    jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
    jmp(SCAN_TO_8_CHAR);
    // Broadcast 'ch' into all word lanes of an XMM register for the 8-char loop.
    bind(SCAN_TO_8_CHAR_INIT);
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  bind(SCAN_TO_8_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR);
  if (UseAVX < 2) {
    // Without AVX2 the broadcast was not emitted above; do it here.
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
  andl(cnt1,0x00000007);  //tail count (in chars)

  // Compare 8 chars (16 bytes) per iteration.
  bind(SCAN_TO_8_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqw(vec3, vec1);
  ptest(vec2, vec3);                     // CF is clear iff some lane matched
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
  // Scalar tail: compare the remaining 0..7 chars one at a time.
  bind(SCAN_TO_CHAR);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_short(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 2);                     // advance one char (2 bytes)
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // Extract a byte mask of the vector compare; the index of the lowest set
  // bit is the byte offset of the first matching char within the vector.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  // Convert the match address into a char index relative to str1.
  subptr(result, str1);
  shrl(result, 1);

  bind(DONE_LABEL);
} // string_indexof_char
3330 
// Intrinsic for indexOf(char) on a Latin-1 (byte) sequence: scans the bytes
// at [str1, str1 + cnt1) for the first element equal to 'ch'. On exit
// 'result' holds the byte index of the first match, or -1 if not found.
// Scans 32 bytes per iteration with AVX2 when available, then 16 bytes per
// iteration with SSE4.2, then a scalar tail loop. Clobbers cnt1, ch, tmp
// and vec1..vec3.
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 16;  // 16 bytes per XMM vector

  Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
        SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);  // result tracks the current scan address
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR_INIT);    // fewer than 16 bytes: scalar loop only
    cmpl(cnt1, stride*2);
    jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); // fewer than 32 bytes: skip the YMM loop
    movdl(vec1, ch);
    vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); // vec1 = 'ch' in all 32 byte lanes
    vpxor(vec2, vec2);                               // vec2 = 0, used as the ptest mask
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
    andl(cnt1,0x0000001F);  //tail count (in chars)

    // Compare 32 bytes per iteration.
    bind(SCAN_TO_32_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
    vptest(vec2, vec3);                  // CF is clear iff some lane matched (vec3 != 0)
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, stride*2);
    jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
    jmp(SCAN_TO_16_CHAR);

    // Broadcast 'ch' into all byte lanes of an XMM register: pshufb with an
    // all-zero shuffle mask replicates byte 0 of vec1 everywhere.
    bind(SCAN_TO_16_CHAR_INIT);
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }

  bind(SCAN_TO_16_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
  if (UseAVX < 2) {
    // Without AVX2 the broadcast was not emitted above; do it here.
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
  andl(cnt1,0x0000000F);  //tail count (in bytes)

  // Compare 16 bytes per iteration.
  bind(SCAN_TO_16_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqb(vec3, vec1);
  ptest(vec2, vec3);                     // CF is clear iff some lane matched
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...

  // Scalar tail: compare the remaining 0..15 bytes one at a time.
  bind(SCAN_TO_CHAR_INIT);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_byte(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 1);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // Extract a byte mask of the vector compare; the index of the lowest set
  // bit is the offset of the first matching byte within the vector.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  // Convert the match address into a byte index relative to str1
  // (no shift needed — elements are single bytes).
  subptr(result, str1);

  bind(DONE_LABEL);
} // stringL_indexof_char
3423 
3424 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3425   switch (eltype) {
3426   case T_BOOLEAN: return sizeof(jboolean);
3427   case T_BYTE:  return sizeof(jbyte);
3428   case T_SHORT: return sizeof(jshort);
3429   case T_CHAR:  return sizeof(jchar);
3430   case T_INT:   return sizeof(jint);
3431   default:
3432     ShouldNotReachHere();
3433     return -1;
3434   }
3435 }
3436 
3437 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3438   switch (eltype) {
3439   // T_BOOLEAN used as surrogate for unsigned byte
3440   case T_BOOLEAN: movzbl(dst, src);   break;
3441   case T_BYTE:    movsbl(dst, src);   break;
3442   case T_SHORT:   movswl(dst, src);   break;
3443   case T_CHAR:    movzwl(dst, src);   break;
3444   case T_INT:     movl(dst, src);     break;
3445   default:
3446     ShouldNotReachHere();
3447   }
3448 }
3449 
3450 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3451   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3452 }
3453 
3454 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3455   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3456 }
3457 
3458 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3459   const int vlen = Assembler::AVX_256bit;
3460   switch (eltype) {
3461   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3462   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3463   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3464   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3465   case T_INT:
3466     // do nothing
3467     break;
3468   default:
3469     ShouldNotReachHere();
3470   }
3471 }
3472 
// Computes the standard Java array hash h = 31*h + element over 'cnt1'
// elements of 'ary1', accumulating into 'result' (which carries the
// caller's initial value). For cnt1 >= 32 a vectorized loop processes
// 32 elements per iteration using four 256-bit accumulators and the
// precomputed powers-of-31 table from the stub routines; the remainder
// (and short arrays) is handled by a scalar loop unrolled by two, plus
// an optional single trailing element. Requires AVX2. Clobbers ary1,
// cnt1, index, tmp2, tmp3 and all vector arguments.
void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
                                        Register index, Register tmp2, Register tmp3, XMMRegister vnext,
                                        XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
                                        XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
                                        XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
                                        BasicType eltype) {
  ShortBranchVerifier sbv(this);
  assert(UseAVX >= 2, "AVX2 intrinsics are required");
  assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
  assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);

  Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
        SHORT_UNROLLED_LOOP_EXIT,
        UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
        UNROLLED_VECTOR_LOOP_BEGIN,
        END;
  switch (eltype) {
  case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
  case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
  case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
  case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
  case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
  default:        BLOCK_COMMENT("arrays_hashcode {");                break;
  }

  // For "renaming" for readability of the code
  const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
                    vresult[] = { vresult0, vresult1, vresult2, vresult3 },
                    vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };

  const int elsize = arrays_hashcode_elsize(eltype);

  /*
    if (cnt1 >= 2) {
      if (cnt1 >= 32) {
        UNROLLED VECTOR LOOP
      }
      UNROLLED SCALAR LOOP
    }
    SINGLE SCALAR
   */

  cmpl(cnt1, 32);
  jcc(Assembler::less, SHORT_UNROLLED_BEGIN);

  // cnt1 >= 32 && generate_vectorized_loop
  xorl(index, index);

  // vresult = IntVector.zero(I256);
  for (int idx = 0; idx < 4; idx++) {
    vpxor(vresult[idx], vresult[idx]);
  }
  // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
  Register bound = tmp2;
  Register next = tmp3;
  lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
  movl(next, Address(tmp2, 0));
  movdl(vnext, next);
  vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);

  // index = 0;
  // bound = cnt1 & ~(32 - 1);
  movl(bound, cnt1);
  andl(bound, ~(32 - 1));
  // for (; index < bound; index += 32) {
  bind(UNROLLED_VECTOR_LOOP_BEGIN);
  // result *= next;
  imull(result, next);
  // loop fission to upfront the cost of fetching from memory, OOO execution
  // can then hopefully do a better job of prefetching
  for (int idx = 0; idx < 4; idx++) {
    arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
  }
  // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
  for (int idx = 0; idx < 4; idx++) {
    vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
    arrays_hashcode_elvcast(vtmp[idx], eltype);
    vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
  }
  // index += 32;
  addl(index, 32);
  // index < bound;
  cmpl(index, bound);
  jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
  // }

  // Advance ary1 past the vector-processed prefix; cnt1 becomes the
  // number of remaining (tail) elements.
  lea(ary1, Address(ary1, bound, Address::times(elsize)));
  subl(cnt1, bound);
  // release bound

  // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
  for (int idx = 0; idx < 4; idx++) {
    lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
    arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
    vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
  }
  // result += vresult.reduceLanes(ADD);
  for (int idx = 0; idx < 4; idx++) {
    reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
  }

  // } else if (cnt1 < 32) {

  bind(SHORT_UNROLLED_BEGIN);
  // int i = 1;
  movl(index, 1);
  cmpl(index, cnt1);
  jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);

  // Scalar loop unrolled by two:
  //   result = result*31*31 + ary1[i-1]*31 + ary1[i]
  // for (; i < cnt1 ; i += 2) {
  bind(SHORT_UNROLLED_LOOP_BEGIN);
  movl(tmp3, 961);  // 961 == 31*31
  imull(result, tmp3);
  arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  // tmp3 = tmp2 * 31, computed as (tmp2 << 5) - tmp2
  movl(tmp3, tmp2);
  shll(tmp3, 5);
  subl(tmp3, tmp2);
  addl(result, tmp3);
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
  addl(result, tmp3);
  addl(index, 2);
  cmpl(index, cnt1);
  jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);

  // }
  // if (i >= cnt1) {
  bind(SHORT_UNROLLED_LOOP_EXIT);
  // Flags are still set by the cmpl(index, cnt1) above (on either path in):
  // index > cnt1 means the unrolled pairs consumed every element.
  jccb(Assembler::greater, END);
  // One element left: result = result*31 + ary1[cnt1-1],
  // with result*31 computed as (result << 5) - result.
  movl(tmp2, result);
  shll(result, 5);
  subl(result, tmp2);
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  addl(result, tmp3);
  // }
  bind(END);

  BLOCK_COMMENT("} // arrays_hashcode");

} // arrays_hashcode
3612 
3613 // helper function for string_compare
3614 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3615                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3616                                            Address::ScaleFactor scale2, Register index, int ae) {
3617   if (ae == StrIntrinsicNode::LL) {
3618     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3619     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3620   } else if (ae == StrIntrinsicNode::UU) {
3621     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3622     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3623   } else {
3624     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3625     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3626   }
3627 }
3628 
3629 // Compare strings, used for char[] and byte[].
3630 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3631                                        Register cnt1, Register cnt2, Register result,
3632                                        XMMRegister vec1, int ae, KRegister mask) {
3633   ShortBranchVerifier sbv(this);
3634   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3635   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only AVX3
3636   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3637   int stride2x2 = 0x40;
3638   Address::ScaleFactor scale = Address::no_scale;
3639   Address::ScaleFactor scale1 = Address::no_scale;
3640   Address::ScaleFactor scale2 = Address::no_scale;
3641 
3642   if (ae != StrIntrinsicNode::LL) {
3643     stride2x2 = 0x20;
3644   }
3645 
3646   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3647     shrl(cnt2, 1);
3648   }
3649   // Compute the minimum of the string lengths and the
3650   // difference of the string lengths (stack).
3651   // Do the conditional move stuff
3652   movl(result, cnt1);
3653   subl(cnt1, cnt2);
3654   push(cnt1);
3655   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3656 
3657   // Is the minimum length zero?
3658   testl(cnt2, cnt2);
3659   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3660   if (ae == StrIntrinsicNode::LL) {
3661     // Load first bytes
3662     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3663     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3664   } else if (ae == StrIntrinsicNode::UU) {
3665     // Load first characters
3666     load_unsigned_short(result, Address(str1, 0));
3667     load_unsigned_short(cnt1, Address(str2, 0));
3668   } else {
3669     load_unsigned_byte(result, Address(str1, 0));
3670     load_unsigned_short(cnt1, Address(str2, 0));
3671   }
3672   subl(result, cnt1);
3673   jcc(Assembler::notZero,  POP_LABEL);
3674 
3675   if (ae == StrIntrinsicNode::UU) {
3676     // Divide length by 2 to get number of chars
3677     shrl(cnt2, 1);
3678   }
3679   cmpl(cnt2, 1);
3680   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3681 
3682   // Check if the strings start at the same location and setup scale and stride
3683   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3684     cmpptr(str1, str2);
3685     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3686     if (ae == StrIntrinsicNode::LL) {
3687       scale = Address::times_1;
3688       stride = 16;
3689     } else {
3690       scale = Address::times_2;
3691       stride = 8;
3692     }
3693   } else {
3694     scale1 = Address::times_1;
3695     scale2 = Address::times_2;
3696     // scale not used
3697     stride = 8;
3698   }
3699 
3700   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3701     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3702     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3703     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3704     Label COMPARE_TAIL_LONG;
3705     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only AVX3
3706 
3707     int pcmpmask = 0x19;
3708     if (ae == StrIntrinsicNode::LL) {
3709       pcmpmask &= ~0x01;
3710     }
3711 
3712     // Setup to compare 16-chars (32-bytes) vectors,
3713     // start from first character again because it has aligned address.
3714     if (ae == StrIntrinsicNode::LL) {
3715       stride2 = 32;
3716     } else {
3717       stride2 = 16;
3718     }
3719     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3720       adr_stride = stride << scale;
3721     } else {
3722       adr_stride1 = 8;  //stride << scale1;
3723       adr_stride2 = 16; //stride << scale2;
3724     }
3725 
3726     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3727     // rax and rdx are used by pcmpestri as elements counters
3728     movl(result, cnt2);
3729     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3730     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3731 
3732     // fast path : compare first 2 8-char vectors.
3733     bind(COMPARE_16_CHARS);
3734     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3735       movdqu(vec1, Address(str1, 0));
3736     } else {
3737       pmovzxbw(vec1, Address(str1, 0));
3738     }
3739     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3740     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3741 
3742     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3743       movdqu(vec1, Address(str1, adr_stride));
3744       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3745     } else {
3746       pmovzxbw(vec1, Address(str1, adr_stride1));
3747       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3748     }
3749     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3750     addl(cnt1, stride);
3751 
3752     // Compare the characters at index in cnt1
3753     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3754     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3755     subl(result, cnt2);
3756     jmp(POP_LABEL);
3757 
3758     // Setup the registers to start vector comparison loop
3759     bind(COMPARE_WIDE_VECTORS);
3760     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3761       lea(str1, Address(str1, result, scale));
3762       lea(str2, Address(str2, result, scale));
3763     } else {
3764       lea(str1, Address(str1, result, scale1));
3765       lea(str2, Address(str2, result, scale2));
3766     }
3767     subl(result, stride2);
3768     subl(cnt2, stride2);
3769     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3770     negptr(result);
3771 
3772     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3773     bind(COMPARE_WIDE_VECTORS_LOOP);
3774 
3775     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3776       cmpl(cnt2, stride2x2);
3777       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3778       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3779       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3780 
3781       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3782       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3783         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3784         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3785       } else {
3786         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3787         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3788       }
3789       kortestql(mask, mask);
3790       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3791       addptr(result, stride2x2);  // update since we already compared at this addr
3792       subl(cnt2, stride2x2);      // and sub the size too
3793       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3794 
3795       vpxor(vec1, vec1);
3796       jmpb(COMPARE_WIDE_TAIL);
3797     }//if (VM_Version::supports_avx512vlbw())
3798 
3799     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3800     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3801       vmovdqu(vec1, Address(str1, result, scale));
3802       vpxor(vec1, Address(str2, result, scale));
3803     } else {
3804       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3805       vpxor(vec1, Address(str2, result, scale2));
3806     }
3807     vptest(vec1, vec1);
3808     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3809     addptr(result, stride2);
3810     subl(cnt2, stride2);
3811     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3812     // clean upper bits of YMM registers
3813     vpxor(vec1, vec1);
3814 
3815     // compare wide vectors tail
3816     bind(COMPARE_WIDE_TAIL);
3817     testptr(result, result);
3818     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3819 
3820     movl(result, stride2);
3821     movl(cnt2, result);
3822     negptr(result);
3823     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3824 
3825     // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
3826     bind(VECTOR_NOT_EQUAL);
3827     // clean upper bits of YMM registers
3828     vpxor(vec1, vec1);
3829     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3830       lea(str1, Address(str1, result, scale));
3831       lea(str2, Address(str2, result, scale));
3832     } else {
3833       lea(str1, Address(str1, result, scale1));
3834       lea(str2, Address(str2, result, scale2));
3835     }
3836     jmp(COMPARE_16_CHARS);
3837 
3838     // Compare tail chars, length between 1 to 15 chars
3839     bind(COMPARE_TAIL_LONG);
3840     movl(cnt2, result);
3841     cmpl(cnt2, stride);
3842     jcc(Assembler::less, COMPARE_SMALL_STR);
3843 
3844     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3845       movdqu(vec1, Address(str1, 0));
3846     } else {
3847       pmovzxbw(vec1, Address(str1, 0));
3848     }
3849     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3850     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3851     subptr(cnt2, stride);
3852     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3853     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3854       lea(str1, Address(str1, result, scale));
3855       lea(str2, Address(str2, result, scale));
3856     } else {
3857       lea(str1, Address(str1, result, scale1));
3858       lea(str2, Address(str2, result, scale2));
3859     }
3860     negptr(cnt2);
3861     jmpb(WHILE_HEAD_LABEL);
3862 
3863     bind(COMPARE_SMALL_STR);
3864   } else if (UseSSE42Intrinsics) {
3865     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3866     int pcmpmask = 0x19;
3867     // Setup to compare 8-char (16-byte) vectors,
3868     // start from first character again because it has aligned address.
3869     movl(result, cnt2);
3870     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3871     if (ae == StrIntrinsicNode::LL) {
3872       pcmpmask &= ~0x01;
3873     }
3874     jcc(Assembler::zero, COMPARE_TAIL);
3875     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3876       lea(str1, Address(str1, result, scale));
3877       lea(str2, Address(str2, result, scale));
3878     } else {
3879       lea(str1, Address(str1, result, scale1));
3880       lea(str2, Address(str2, result, scale2));
3881     }
3882     negptr(result);
3883 
3884     // pcmpestri
3885     //   inputs:
3886     //     vec1- substring
3887     //     rax - negative string length (elements count)
3888     //     mem - scanned string
3889     //     rdx - string length (elements count)
3890     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3891     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3892     //   outputs:
3893     //     rcx - first mismatched element index
3894     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3895 
3896     bind(COMPARE_WIDE_VECTORS);
3897     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3898       movdqu(vec1, Address(str1, result, scale));
3899       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3900     } else {
3901       pmovzxbw(vec1, Address(str1, result, scale1));
3902       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3903     }
3904     // After pcmpestri cnt1(rcx) contains mismatched element index
3905 
3906     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3907     addptr(result, stride);
3908     subptr(cnt2, stride);
3909     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3910 
3911     // compare wide vectors tail
3912     testptr(result, result);
3913     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3914 
3915     movl(cnt2, stride);
3916     movl(result, stride);
3917     negptr(result);
3918     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3919       movdqu(vec1, Address(str1, result, scale));
3920       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3921     } else {
3922       pmovzxbw(vec1, Address(str1, result, scale1));
3923       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3924     }
3925     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3926 
3927     // Mismatched characters in the vectors
3928     bind(VECTOR_NOT_EQUAL);
3929     addptr(cnt1, result);
3930     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3931     subl(result, cnt2);
3932     jmpb(POP_LABEL);
3933 
3934     bind(COMPARE_TAIL); // limit is zero
3935     movl(cnt2, result);
3936     // Fallthru to tail compare
3937   }
3938   // Shift str2 and str1 to the end of the arrays, negate min
3939   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3940     lea(str1, Address(str1, cnt2, scale));
3941     lea(str2, Address(str2, cnt2, scale));
3942   } else {
3943     lea(str1, Address(str1, cnt2, scale1));
3944     lea(str2, Address(str2, cnt2, scale2));
3945   }
3946   decrementl(cnt2);  // first character was compared already
3947   negptr(cnt2);
3948 
3949   // Compare the rest of the elements
3950   bind(WHILE_HEAD_LABEL);
3951   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3952   subl(result, cnt1);
3953   jccb(Assembler::notZero, POP_LABEL);
3954   increment(cnt2);
3955   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3956 
3957   // Strings are equal up to min length.  Return the length difference.
3958   bind(LENGTH_DIFF_LABEL);
3959   pop(result);
3960   if (ae == StrIntrinsicNode::UU) {
3961     // Divide diff by 2 to get number of chars
3962     sarl(result, 1);
3963   }
3964   jmpb(DONE_LABEL);
3965 
3966   if (VM_Version::supports_avx512vlbw()) {
3967 
3968     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3969 
3970     kmovql(cnt1, mask);
3971     notq(cnt1);
3972     bsfq(cnt2, cnt1);
3973     if (ae != StrIntrinsicNode::LL) {
3974       // Divide diff by 2 to get number of chars
3975       sarl(cnt2, 1);
3976     }
3977     addq(result, cnt2);
3978     if (ae == StrIntrinsicNode::LL) {
3979       load_unsigned_byte(cnt1, Address(str2, result));
3980       load_unsigned_byte(result, Address(str1, result));
3981     } else if (ae == StrIntrinsicNode::UU) {
3982       load_unsigned_short(cnt1, Address(str2, result, scale));
3983       load_unsigned_short(result, Address(str1, result, scale));
3984     } else {
3985       load_unsigned_short(cnt1, Address(str2, result, scale2));
3986       load_unsigned_byte(result, Address(str1, result, scale1));
3987     }
3988     subl(result, cnt1);
3989     jmpb(POP_LABEL);
3990   }//if (VM_Version::supports_avx512vlbw())
3991 
3992   // Discard the stored length difference
3993   bind(POP_LABEL);
3994   pop(cnt1);
3995 
3996   // That's it
3997   bind(DONE_LABEL);
3998   if(ae == StrIntrinsicNode::UL) {
3999     negl(result);
4000   }
4001 
4002 }
4003 
4004 // Search for Non-ASCII character (Negative byte value) in a byte array,
4005 // return the index of the first such character, otherwise the length
4006 // of the array segment searched.
4007 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4008 //   @IntrinsicCandidate
4009 //   public static int countPositives(byte[] ba, int off, int len) {
4010 //     for (int i = off; i < off + len; i++) {
4011 //       if (ba[i] < 0) {
4012 //         return i - off;
4013 //       }
4014 //     }
4015 //     return len;
4016 //   }
void C2_MacroAssembler::count_positives(Register ary1, Register len,
  Register result, Register tmp1,
  XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
  // Returns (in 'result') the number of leading non-negative bytes of ary1,
  // i.e. the index of the first byte with its sign bit set, or 'len' when no
  // such byte exists. 'result' starts equal to 'len' and is only adjusted
  // downwards once a negative byte has been located.
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  // NOTE(review): label ADJUST appears to be declared but never bound or used.
  Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  movl(result, len); // copy
  // len == 0
  testl(len, len);
  jcc(Assembler::zero, DONE);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
    VM_Version::supports_avx512vlbw() &&
    VM_Version::supports_bmi2()) {

    Label test_64_loop, test_tail, BREAK_LOOP;
    movl(tmp1, len);
    // vec2 = all zeroes; a byte is negative iff 0 > byte (signed compare below).
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);

    andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
    andl(len,  0xffffffc0); // vector count (in chars)
    jccb(Assembler::zero, test_tail);

    // Point ary1 past the vectorized region and count len up from -N to 0.
    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len);

    bind(test_64_loop);
    // Check whether our 64 elements of size byte contain negatives
    evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(mask1, mask1);  // ZF clear iff any mask bit set, i.e. any negative byte
    jcc(Assembler::notZero, BREAK_LOOP);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done
    testl(tmp1, -1);  // AND with all-ones: ZF iff tail count is zero
    jcc(Assembler::zero, DONE);


    // check the tail for absence of negatives
    // ~(~0 << len) applied up to two times (for 32-bit scenario)
    {
      Register tmp3_aliased = len;
      mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
      shlxq(tmp3_aliased, tmp3_aliased, tmp1);
      notq(tmp3_aliased);
      kmovql(mask2, tmp3_aliased);  // mask2 has exactly the low 'tmp1' bits set
    }

    // Masked compare: only the first 'tmp1' bytes of the tail participate.
    evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(mask1, mask2);
    jcc(Assembler::zero, DONE);

    // do a full check for negative registers in the tail
    movl(len, tmp1); // tmp1 holds low 6-bit from original len;
                     // ary1 already pointing to the right place
    jmpb(TAIL_START);

    bind(BREAK_LOOP);
    // At least one byte in the last 64 byte block was negative.
    // Set up to look at the last 64 bytes as if they were a tail
    lea(ary1, Address(ary1, len, Address::times_1));
    addptr(result, len);
    // Ignore the very last byte: if all others are positive,
    // it must be negative, so we can skip right to the 2+1 byte
    // end comparison at this point
    orl(result, 63);
    movl(len, 63);
    // Fallthru to tail compare
  } else {

    if (UseAVX >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 32-byte vectors
      testl(len, 0xffffffe0);   // vector count (in bytes)
      jccb(Assembler::zero, TAIL_START);

      andl(len, 0xffffffe0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      // ZF set iff (vec1 & 0x8080...80) == 0, i.e. no sign bits in this chunk.
      vptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 32);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000001f);   // any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000001f);
      // Re-test the last 32 bytes ending at the array's end (overlaps data
      // already known to be positive).
      vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
      vptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      // There are zeros, jump to the tail to determine exactly where
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 32-byte vector is negative.
      // Set up to look at the last 32 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 31);
      movl(len, 31);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 16-byte vectors
      testl(len, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, TAIL_START);

      andl(len, 0xfffffff0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);   // sign-bit mask, one per byte lane
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);   // ZF set iff no byte has its sign bit set
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 16);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000000f); // len is zero, any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000000f);   // tail count (in bytes)
      movdqu(vec1, Address(ary1, len, Address::times_1, -16));
      ptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 16-byte vector is negative.
      // Set up and look at the last 16 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 15);
      movl(len, 15);
      // Fallthru to tail compare
    }
  }

  bind(TAIL_START);
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  andl(tmp1, 0x80808080);   // any sign bit in these 4 bytes?
  jccb(Assembler::notZero, TAIL_ADJUST);
  addptr(len, 4);
  jccb(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2-3 bytes), if any
  bind(COMPARE_CHAR);

  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080);   // sign bits of the two tail bytes
  jccb(Assembler::notZero, CHAR_ADJUST);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1);   // tail  byte
  jccb(Assembler::zero, DONE);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  testl(tmp1, 0x00000080);
  jccb(Assembler::zero, DONE);
  subptr(result, 1);    // last byte is negative: exclude it from the count
  jmpb(DONE);

  bind(TAIL_ADJUST);
  // there are negative bits in the last 4 byte block.
  // Adjust result and check the next three bytes
  addptr(result, len);
  orl(result, 3);
  lea(ary1, Address(ary1, len, Address::times_1));
  jmpb(COMPARE_CHAR);

  bind(CHAR_ADJUST);
  // We are looking at a char + optional byte tail, and found that one
  // of the bytes in the char is negative. Adjust the result, check the
  // first byte and readjust if needed.
  andl(result, 0xfffffffc);
  testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
  jccb(Assembler::notZero, DONE);
  addptr(result, 1);

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
4248 
4249 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                      Register limit, Register result, Register chr,
                                      XMMRegister vec1, XMMRegister vec2, bool is_char,
                                      KRegister mask, bool expand_ary2) {
  // Sets 'result' to 1 when the two arrays/regions hold identical data, 0
  // otherwise. When is_array_equ, ary1/ary2 are array oops: identity, null and
  // length checks are done here and the data pointers are derived from the
  // array base offset. Otherwise ary1/ary2 already point at the data and
  // 'limit' holds the amount to compare (callers pre-scale; units for the
  // substring case are not visible here — TODO confirm against callers).
  // When expand_ary2, every byte of ary2 is zero-extended to a short and
  // compared against a short of ary1, so ary1 advances twice as fast.
  // for expand_ary2, limit is the (smaller) size of the second array.
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;

  assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
         "Expansion only implemented for AVX2");

  int length_offset  = arrayOopDesc::length_offset_in_bytes();
  int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);

  // limit/result count ary2 bytes; ary1 is addressed with scaleFactor so it
  // moves 2 bytes per ary2 byte in expansion mode.
  Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
  int scaleIncr = expand_ary2 ? 8 : 16;

  if (is_array_equ) {
    // Check the input args
    cmpoop(ary1, ary2);
    jcc(Assembler::equal, TRUE_LABEL);

    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }

  if (is_array_equ && is_char) {
    // arrays_equals when used for char[].
    shll(limit, 1);      // byte count != 0
  }
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;

    // Compare 32-byte vectors
    if (expand_ary2) {
      andl(result, 0x0000000f);  //   tail count (in bytes)
      andl(limit, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL);
    } else {
      andl(result, 0x0000001f);  //   tail count (in bytes)
      andl(limit, 0xffffffe0);   // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL_16);
    }

    // Point both arrays past the vectorized region; limit counts up to zero.
    lea(ary1, Address(ary1, limit, scaleFactor));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;

      cmpl(limit, -64);
      jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop

      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
      // kortest sets CF iff the mask is all ones (all 64 bytes equal);
      // aboveEqual (CF==0) therefore means at least one byte differed.
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
      addptr(limit, 64);  // update since we already compared at this addr
      cmpl(limit, -64);
      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instruction and just continue via non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL);  // true
      // But since we stopped at the points ary{1,2}+limit which are
      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
      //
      addptr(result, -64);   // it is safe, bc we just came from this area
      evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare

      jmp(TRUE_LABEL);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    }//if (VM_Version::supports_avx512vlbw())

    bind(COMPARE_WIDE_VECTORS);
    vmovdqu(vec1, Address(ary1, limit, scaleFactor));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
    } else {
      vmovdqu(vec2, Address(ary2, limit, Address::times_1));
    }
    vpxor(vec1, vec2);  // all-zero iff the two 32-byte chunks are identical

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, scaleIncr * 2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    // Handle the tail by redoing the last full vector ending at the arrays'
    // ends (overlaps data already known to be equal).
    vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
    } else {
      vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
    }
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    jmp(TRUE_LABEL);

    bind(COMPARE_TAIL_16); // limit is zero
    movl(limit, result);

    // Compare 16-byte chunks
    andl(result, 0x0000000f);  //   tail count (in bytes)
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, scaleFactor));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS_16);
    movdqu(vec1, Address(ary1, limit, scaleFactor));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
    } else {
      movdqu(vec2, Address(ary2, limit, Address::times_1));
    }
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, scaleIncr);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  } else if (UseSSE42Intrinsics) {
    // With SSE4.2, use double quad vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 16-byte vectors
    andl(result, 0x0000000f);  //   tail count (in bytes)
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(ary1, limit, Address::times_1));
    movdqu(vec2, Address(ary2, limit, Address::times_1));
    pxor(vec1, vec2);   // all-zero iff the 16-byte chunks match

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 16);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    // Redo the last 16 bytes ending at the arrays' ends (overlap is safe).
    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  }

  // Compare 4-byte vectors
  if (expand_ary2) {
    testl(result, result);
    jccb(Assembler::zero, TRUE_LABEL);
  } else {
    andl(limit, 0xfffffffc); // vector count (in bytes)
    jccb(Assembler::zero, COMPARE_CHAR);
  }

  lea(ary1, Address(ary1, limit, scaleFactor));
  lea(ary2, Address(ary2, limit, Address::times_1));
  negptr(limit);

  bind(COMPARE_VECTORS);
  if (expand_ary2) {
    // There are no "vector" operations for bytes to shorts
    movzbl(chr, Address(ary2, limit, Address::times_1));
    cmpw(Address(ary1, limit, Address::times_2), chr);
    jccb(Assembler::notEqual, FALSE_LABEL);
    addptr(limit, 1);
    jcc(Assembler::notZero, COMPARE_VECTORS);
    jmp(TRUE_LABEL);
  } else {
    movl(chr, Address(ary1, limit, Address::times_1));
    cmpl(chr, Address(ary2, limit, Address::times_1));
    jccb(Assembler::notEqual, FALSE_LABEL);
    addptr(limit, 4);
    jcc(Assembler::notZero, COMPARE_VECTORS);
  }

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(chr, Address(ary1, 0));
  load_unsigned_short(limit, Address(ary2, 0));
  cmpl(chr, limit);
  jccb(Assembler::notEqual, FALSE_LABEL);

  if (is_array_equ && is_char) {
    // char arrays have an even byte count, so no trailing byte is possible.
    bind(COMPARE_BYTE);
  } else {
    lea(ary1, Address(ary1, 2));
    lea(ary2, Address(ary2, 2));

    bind(COMPARE_BYTE);
    testl(result, 0x1);   // tail  byte
    jccb(Assembler::zero, TRUE_LABEL);
    load_unsigned_byte(chr, Address(ary1, 0));
    load_unsigned_byte(limit, Address(ary2, 0));
    cmpl(chr, limit);
    jccb(Assembler::notEqual, FALSE_LABEL);
  }
  bind(TRUE_LABEL);
  movl(result, 1);   // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
4521 
// Out-of-line slow path for convertF2I: invoked when the fast cvtt*
// conversion produced the sentinel value, it calls the appropriate
// fixup stub routine to compute the Java-mandated result.
static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
#define __ masm.
  // Stub data captured at stub-creation time in convertF2I.
  Register dst = stub.data<0>();
  XMMRegister src = stub.data<1>();
  address target = stub.data<2>();
  __ bind(stub.entry());
  // Spill the source value into a fresh 8-byte stack slot and call the fixup.
  __ subptr(rsp, 8);
  __ movdbl(Address(rsp), src);
  __ call(RuntimeAddress(target));
  // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
  // NOTE(review): the fixup routine apparently leaves its result in the stack
  // slot spilled above; this pop both fetches it and restores rsp — confirm
  // against the fixup stubs' calling convention.
  __ pop(dst);
  __ jmp(stub.continuation());
#undef __
}
4536 
4537 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4538   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4539   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4540 
4541   address slowpath_target;
4542   if (dst_bt == T_INT) {
4543     if (src_bt == T_FLOAT) {
4544       cvttss2sil(dst, src);
4545       cmpl(dst, 0x80000000);
4546       slowpath_target = StubRoutines::x86::f2i_fixup();
4547     } else {
4548       cvttsd2sil(dst, src);
4549       cmpl(dst, 0x80000000);
4550       slowpath_target = StubRoutines::x86::d2i_fixup();
4551     }
4552   } else {
4553     if (src_bt == T_FLOAT) {
4554       cvttss2siq(dst, src);
4555       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4556       slowpath_target = StubRoutines::x86::f2l_fixup();
4557     } else {
4558       cvttsd2siq(dst, src);
4559       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4560       slowpath_target = StubRoutines::x86::d2l_fixup();
4561     }
4562   }
4563 
4564   // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
4565   int max_size = 23 + (UseAPX ? 1 : 0);
4566   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
4567   jcc(Assembler::equal, stub->entry());
4568   bind(stub->continuation());
4569 }
4570 
// Emit an AVX-512 masked vector shift/rotate with an immediate shift count.
// 'mask' selects the active lanes; 'merge' is forwarded to the underlying
// EVEX emitters (merge- vs zero-masking of inactive lanes). 'eType' is only
// consulted for the rotate cases.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, int imm8, bool merge, int vlen_enc) {
  switch(ideal_opc) {
    case Op_LShiftVS:
      Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_LShiftVI:
      Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_LShiftVL:
      Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVS:
      Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVI:
      Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVL:
      Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVS:
      Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVI:
      Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVL:
      Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RotateRightV:
      evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RotateLeftV:
      evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4601 
4602 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4603                                                XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4604   if (is_unsigned) {
4605     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4606   } else {
4607     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4608   }
4609 }
4610 
// Masked signed saturating add/sub, register-register form. Only byte and
// short lanes are supported (the hardware provides saturating arithmetic
// only for those widths); any other element type is fatal.
void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
                                                      XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
  switch (elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}
4635 
// Masked unsigned saturating add/sub, register-register form. Mirrors the
// signed variant above but uses the unsigned-saturating instructions;
// only byte and short lanes are supported.
void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
                                                        XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
  switch (elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}
4660 
4661 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4662                                                Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4663   if (is_unsigned) {
4664     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4665   } else {
4666     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4667   }
4668 }
4669 
// Masked signed saturating add/sub with a memory source operand. Same
// structure as the register-register form; only byte and short lanes
// are supported.
void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
                                                      XMMRegister src1, Address src2, bool merge, int vlen_enc) {
  switch (elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}
4694 
// Masked unsigned saturating add/sub with a memory source operand. Same
// structure as the register-register form; only byte and short lanes
// are supported.
void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
                                                        XMMRegister src1, Address src2, bool merge, int vlen_enc) {
  switch (elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}
4719 
// Emits the AVX-512 masked (predicated) vector instruction matching the given
// ideal opcode, register-register form. 'mask' is the opmask predicate and
// 'merge' is forwarded to the instruction encoders (under AVX-512 masking this
// selects whether masked-off destination lanes are preserved or zeroed).
// 'eType' supplies the lane type to the type-polymorphic helpers, and
// 'is_varshift' selects the per-lane variable-shift encodings for shift ops.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
                                    bool is_varshift) {
  switch (ideal_opc) {
    // Addition (integral and floating point).
    case Op_AddVB:
      evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVS:
      evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVI:
      evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVL:
      evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVF:
      evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVD:
      evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Subtraction.
    case Op_SubVB:
      evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVS:
      evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVI:
      evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVL:
      evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVF:
      evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVD:
      evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Multiplication (no byte-lane multiply exists).
    case Op_MulVS:
      evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVI:
      evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVL:
      evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVF:
      evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVD:
      evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Floating point division and square root.
    case Op_DivVF:
      evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVD:
      evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SqrtVF:
      evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SqrtVD:
      evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Absolute value (unary -- the operand is passed in src2).
    case Op_AbsVB:
      evpabsb(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVS:
      evpabsw(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVI:
      evpabsd(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVL:
      evpabsq(dst, mask, src2, merge, vlen_enc); break;
    // Fused multiply-add (213 operand ordering).
    case Op_FmaVF:
      evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVD:
      evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Rearrange -- note evperm takes the sources in swapped order (src2, src1).
    case Op_VectorRearrange:
      evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
    // Shifts; 'is_varshift' distinguishes per-lane variable shift counts.
    case Op_LShiftVS:
      evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_LShiftVI:
      evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_LShiftVL:
      evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVS:
      evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVI:
      evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVL:
      evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVS:
      evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVI:
      evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVL:
      evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    // Rotates.
    case Op_RotateLeftV:
      evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_RotateRightV:
      evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    // Signed/unsigned min/max.
    case Op_MaxV:
      evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MinV:
      evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMinV:
      evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMaxV:
      evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    // Bitwise logical ops.
    case Op_XorV:
      evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_OrV:
      evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AndV:
      evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4821 
// Emits the AVX-512 masked (predicated) vector instruction matching the given
// ideal opcode, memory-operand form: src2 is an Address instead of a register.
// Only the opcodes with a memory-source encoding used by the matcher are
// handled here (no shifts/rotates/abs/rearrange). See the register-register
// overload above for the meaning of 'mask', 'merge', and 'eType'.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, Address src2, bool merge, int vlen_enc) {
  switch (ideal_opc) {
    // Addition (integral and floating point).
    case Op_AddVB:
      evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVS:
      evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVI:
      evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVL:
      evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVF:
      evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVD:
      evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Subtraction.
    case Op_SubVB:
      evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVS:
      evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVI:
      evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVL:
      evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVF:
      evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVD:
      evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Multiplication.
    case Op_MulVS:
      evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVI:
      evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVL:
      evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVF:
      evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVD:
      evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Floating point division.
    case Op_DivVF:
      evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVD:
      evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Fused multiply-add (213 operand ordering).
    case Op_FmaVF:
      evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVD:
      evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Signed/unsigned min/max.
    case Op_MaxV:
      evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MinV:
      evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMaxV:
      evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMinV:
      evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    // Bitwise logical ops.
    case Op_XorV:
      evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_OrV:
      evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AndV:
      evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4886 
4887 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4888                                   KRegister src1, KRegister src2) {
4889   BasicType etype = T_ILLEGAL;
4890   switch(mask_len) {
4891     case 2:
4892     case 4:
4893     case 8:  etype = T_BYTE; break;
4894     case 16: etype = T_SHORT; break;
4895     case 32: etype = T_INT; break;
4896     case 64: etype = T_LONG; break;
4897     default: fatal("Unsupported type"); break;
4898   }
4899   assert(etype != T_ILLEGAL, "");
4900   switch(ideal_opc) {
4901     case Op_AndVMask:
4902       kand(etype, dst, src1, src2); break;
4903     case Op_OrVMask:
4904       kor(etype, dst, src1, src2); break;
4905     case Op_XorVMask:
4906       kxor(etype, dst, src1, src2); break;
4907     default:
4908       fatal("Unsupported masked operation"); break;
4909   }
4910 }
4911 
4912 /*
4913  * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
4914  * If src is NaN, the result is 0.
4915  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4916  * the result is equal to the value of Integer.MIN_VALUE.
4917  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4918  * the result is equal to the value of Integer.MAX_VALUE.
4919  */
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                   XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
                                                                   Register rscratch, AddressLiteral float_sign_flip,
                                                                   int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // The convert emits the marker value (float_sign_flip, 0x80000000) for any
  // special input. Fast path: if no dst lane holds the marker, nothing to fix.
  vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
  vptest(xtmp2, xtmp2, vec_enc);
  jccb(Assembler::equal, done);

  // xtmp1 = ~0x80000000 = 0x7FFFFFFF (Integer.MAX_VALUE) in every lane.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
  vpxor(xtmp1, xtmp1, xtmp4, vec_enc);

  // Zero destination lanes whose source is NaN (unordered self-compare).
  vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
  vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);

  // Recompute the mask for remaining special value.
  vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
  // Extract SRC values corresponding to TRUE mask lanes.
  vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
  // values are set.
  vpxor(xtmp3, xtmp2, xtmp4, vec_enc);

  // Replace the marker with Integer.MAX_VALUE in lanes whose source was +ve.
  vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
  bind(done);
}
4949 
// EVEX flavour of the float->int special-value fixup (see the algorithm
// comment above vector_cast_float_to_int_special_cases_avx). Uses opmask
// registers for the lane selection instead of vector blends.
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                    XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                    Register rscratch, AddressLiteral float_sign_flip,
                                                                    int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // ktmp1 = lanes where the convert produced the marker (float_sign_flip,
  // 0x80000000). Fast path out if there are none.
  evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero destination lanes whose source is NaN (unordered self-compare).
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // Drop the NaN lanes from the special-value mask, then select the special
  // lanes whose source is not-less-than zero (xtmp2 is the zero vector here).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  // xtmp2 = ~xtmp1 (ternary-logic imm 0x11), i.e. 0x7FFFFFFF per lane;
  // write Integer.MAX_VALUE into the selected +ve special lanes.
  vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
4971 
// Float->long special-value fixup: same scheme as the float->int EVEX routine
// above, but with quadword lanes -- 'double_sign_flip' holds the 64-bit
// marker pattern and the replacement becomes Long.MIN/MAX_VALUE. Source
// lanes are still float, so the compares use evcmpps.
void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral double_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // ktmp1 = lanes where the convert produced the 64-bit marker value.
  evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero destination lanes whose (float) source is NaN.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // Drop NaN lanes from the mask, select non-negative special lanes, and
  // overwrite them with ~marker (= Long.MAX_VALUE) via ternary-logic 0x11.
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
4994 
// Double->int special-value fixup: same scheme as the float->int EVEX routine
// above, but the source lanes are double, so the compares use evcmppd while
// the destination moves stay dword-sized.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral float_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // ktmp1 = destination lanes holding the marker value (0x80000000).
  evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero destination lanes whose (double) source is NaN.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // Drop NaN lanes from the mask, select non-negative special lanes, and
  // overwrite them with ~marker (= Integer.MAX_VALUE) via ternary-logic 0x11.
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5016 
5017 /*
5018  * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
5019  * If src is NaN, the result is 0.
5020  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5021  * the result is equal to the value of Long.MIN_VALUE.
5022  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5023  * the result is equal to the value of Long.MAX_VALUE.
5024  */
void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                      Register rscratch, AddressLiteral double_sign_flip,
                                                                      int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // ktmp1 = lanes where the convert produced the 64-bit marker value
  // (double_sign_flip). Fast path out if there are none.
  evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero destination lanes whose source is NaN (unordered self-compare).
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // Drop NaN lanes from the mask, select non-negative special lanes, and
  // overwrite them with ~marker (= Long.MAX_VALUE) via ternary-logic 0x11.
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5047 
5048 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5049                                                              XMMRegister xtmp, int index, int vec_enc) {
5050    assert(vec_enc < Assembler::AVX_512bit, "");
5051    if (vec_enc == Assembler::AVX_256bit) {
5052      vextractf128_high(xtmp, src);
5053      vshufps(dst, src, xtmp, index, vec_enc);
5054    } else {
5055      vshufps(dst, src, zero, index, vec_enc);
5056    }
5057 }
5058 
// AVX (no opmask) flavour of the double->int special-value fixup. The source
// vector keeps double lanes ('src_vec_enc' width) while the converted result
// in dst occupies only a 128-bit vector of int lanes, hence the cross-lane
// doubleword packing of each compare mask before blending.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                                    XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
                                                                    AddressLiteral float_sign_flip, int src_vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");

  Label done;
  // Compare the destination lanes with float_sign_flip
  // value to get mask for all special values.
  movdqu(xtmp1, float_sign_flip, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
  ptest(xtmp2, xtmp2);
  jccb(Assembler::equal, done);

  // Flip float_sign_flip to get max integer value.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
  pxor(xtmp1, xtmp4);

  // Set destination lanes corresponding to unordered source lanes to zero.
  vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
  vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);

  // Shuffle mask vector and pack the lower double word from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);

  // Recompute the mask for remaining special value.
  pxor(xtmp2, xtmp3);
  // Extract mask corresponding to non-negative source lanes.
  vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);

  // Shuffle mask vector and pack the lower double word from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  pand(xtmp3, xtmp2);

  // Replace destination lanes holding special value(0x80000000) with max int
  // if corresponding source lane holds a +ve value.
  vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
  bind(done);
}
5098 
5099 
// Narrows packed int lanes in 'dst' to short or byte lanes, packing the
// results into the low portion of the vector. 'zero' must hold an all-zero
// vector; 'xtmp' is scratch for the 256-bit cross-lane fixup.
void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
                                                   XMMRegister xtmp, Register rscratch, int vec_enc) {
  switch(to_elem_bt) {
    case T_SHORT:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
      // Mask each int lane down to its short bits so the unsigned-saturating
      // pack below cannot clamp, then pack ints to shorts.
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        // vpackusdw packs within 128-bit lanes; fix up the cross-lane order.
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      break;
    case  T_BYTE:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
      // Mask each int lane down to its byte bits, then pack twice:
      // int -> short -> byte.
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        // Same cross-lane fixup as the short path.
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      vpackuswb(dst, dst, zero, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt));
  }
}
5123 
5124 /*
5125  * Algorithm for vector D2L and F2I conversions (AVX 10.2 unsupported):-
5126  * a) Perform vector D2L/F2I cast.
5127  * b) Choose fast path if none of the result vector lane contains 0x80000000 value.
5128  *    It signifies that source value could be any of the special floating point
5129  *    values(NaN,-Inf,Inf,Max,-Min).
5130  * c) Set destination to zero if source is NaN value.
5131  * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5132  */
5133 
5134 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5135                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5136                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5137   int to_elem_sz = type2aelembytes(to_elem_bt);
5138   assert(to_elem_sz <= 4, "");
5139   vcvttps2dq(dst, src, vec_enc);
5140   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5141   if (to_elem_sz < 4) {
5142     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5143     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5144   }
5145 }
5146 
5147 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5148                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5149                                             Register rscratch, int vec_enc) {
5150   int to_elem_sz = type2aelembytes(to_elem_bt);
5151   assert(to_elem_sz <= 4, "");
5152   vcvttps2dq(dst, src, vec_enc);
5153   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5154   switch(to_elem_bt) {
5155     case T_INT:
5156       break;
5157     case T_SHORT:
5158       evpmovdw(dst, dst, vec_enc);
5159       break;
5160     case T_BYTE:
5161       evpmovdb(dst, dst, vec_enc);
5162       break;
5163     default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt));
5164   }
5165 }
5166 
// Casts float lanes to long lanes: EVEX truncating convert followed by the
// special-value (NaN/Inf/out-of-range) fixup pass.
void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                            KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
                                            Register rscratch, int vec_enc) {
  evcvttps2qq(dst, src, vec_enc);
  vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
}
5173 
5174 // Handling for downcasting from double to integer or sub-word types on AVX2.
5175 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5176                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5177                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5178   int to_elem_sz = type2aelembytes(to_elem_bt);
5179   assert(to_elem_sz < 8, "");
5180   vcvttpd2dq(dst, src, vec_enc);
5181   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5182                                               float_sign_flip, vec_enc);
5183   if (to_elem_sz < 4) {
5184     // xtmp4 holds all zero lanes.
5185     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5186   }
5187 }
5188 
5189 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5190                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5191                                             KRegister ktmp2, AddressLiteral sign_flip,
5192                                             Register rscratch, int vec_enc) {
5193   if (VM_Version::supports_avx512dq()) {
5194     evcvttpd2qq(dst, src, vec_enc);
5195     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5196     switch(to_elem_bt) {
5197       case T_LONG:
5198         break;
5199       case T_INT:
5200         evpmovsqd(dst, dst, vec_enc);
5201         break;
5202       case T_SHORT:
5203         evpmovsqd(dst, dst, vec_enc);
5204         evpmovdw(dst, dst, vec_enc);
5205         break;
5206       case T_BYTE:
5207         evpmovsqd(dst, dst, vec_enc);
5208         evpmovdb(dst, dst, vec_enc);
5209         break;
5210       default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt));
5211     }
5212   } else {
5213     assert(type2aelembytes(to_elem_bt) <= 4, "");
5214     vcvttpd2dq(dst, src, vec_enc);
5215     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5216     switch(to_elem_bt) {
5217       case T_INT:
5218         break;
5219       case T_SHORT:
5220         evpmovdw(dst, dst, vec_enc);
5221         break;
5222       case T_BYTE:
5223         evpmovdb(dst, dst, vec_enc);
5224         break;
5225       default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt));
5226     }
5227   }
5228 }
5229 
5230 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5231   switch(to_elem_bt) {
5232     case T_LONG:
5233       evcvttps2qqs(dst, src, vec_enc);
5234       break;
5235     case T_INT:
5236       evcvttps2dqs(dst, src, vec_enc);
5237       break;
5238     case T_SHORT:
5239       evcvttps2dqs(dst, src, vec_enc);
5240       evpmovdw(dst, dst, vec_enc);
5241       break;
5242     case T_BYTE:
5243       evcvttps2dqs(dst, src, vec_enc);
5244       evpmovdb(dst, dst, vec_enc);
5245       break;
5246     default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt));
5247   }
5248 }
5249 
5250 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5251   switch(to_elem_bt) {
5252     case T_LONG:
5253       evcvttps2qqs(dst, src, vec_enc);
5254       break;
5255     case T_INT:
5256       evcvttps2dqs(dst, src, vec_enc);
5257       break;
5258     case T_SHORT:
5259       evcvttps2dqs(dst, src, vec_enc);
5260       evpmovdw(dst, dst, vec_enc);
5261       break;
5262     case T_BYTE:
5263       evcvttps2dqs(dst, src, vec_enc);
5264       evpmovdb(dst, dst, vec_enc);
5265       break;
5266     default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt));
5267   }
5268 }
5269 
5270 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5271   switch(to_elem_bt) {
5272     case T_LONG:
5273       evcvttpd2qqs(dst, src, vec_enc);
5274       break;
5275     case T_INT:
5276       evcvttpd2dqs(dst, src, vec_enc);
5277       break;
5278     case T_SHORT:
5279       evcvttpd2dqs(dst, src, vec_enc);
5280       evpmovdw(dst, dst, vec_enc);
5281       break;
5282     case T_BYTE:
5283       evcvttpd2dqs(dst, src, vec_enc);
5284       evpmovdb(dst, dst, vec_enc);
5285       break;
5286     default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (reg src): %s", type2name(to_elem_bt));
5287   }
5288 }
5289 
5290 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5291   switch(to_elem_bt) {
5292     case T_LONG:
5293       evcvttpd2qqs(dst, src, vec_enc);
5294       break;
5295     case T_INT:
5296       evcvttpd2dqs(dst, src, vec_enc);
5297       break;
5298     case T_SHORT:
5299       evcvttpd2dqs(dst, src, vec_enc);
5300       evpmovdw(dst, dst, vec_enc);
5301       break;
5302     case T_BYTE:
5303       evcvttpd2dqs(dst, src, vec_enc);
5304       evpmovdb(dst, dst, vec_enc);
5305       break;
5306     default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt));
5307   }
5308 }
5309 
5310 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5311                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5312                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5313   // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
5314   // and re-instantiate original MXCSR.RC mode after that.
5315   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5316 
5317   mov64(tmp, julong_cast(0.5L));
5318   evpbroadcastq(xtmp1, tmp, vec_enc);
5319   vaddpd(xtmp1, src , xtmp1, vec_enc);
5320   evcvtpd2qq(dst, xtmp1, vec_enc);
5321   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5322                                                 double_sign_flip, vec_enc);;
5323 
5324   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5325 }
5326 
// Rounds each float lane of 'src' to int as floor(val + 0.5) under a
// temporarily switched MXCSR rounding mode, patches special-value lanes,
// and restores the standard MXCSR. 'tmp' doubles as the rscratch register.
void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Broadcast 0.5f into every lane of xtmp1 and add it to the source.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  vcvtps2dq(dst, xtmp1, vec_enc);
  // Patch lanes whose source was NaN/Inf/out-of-range.
  vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                              float_sign_flip, vec_enc);

  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5344 
// AVX (no opmask) flavour of the float rounding routine above: floor(val+0.5)
// under a switched MXCSR rounding mode, special-value fixup via vector
// blends, then the standard MXCSR is restored.
void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
                                               AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                               Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Broadcast 0.5f into every lane of xtmp1 and add it to the source.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  vcvtps2dq(dst, xtmp1, vec_enc);
  // Patch lanes whose source was NaN/Inf/out-of-range.
  vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);

  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5361 
5362 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5363                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5364   switch (from_elem_bt) {
5365     case T_BYTE:
5366       switch (to_elem_bt) {
5367         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5368         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5369         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5370         default: ShouldNotReachHere();
5371       }
5372       break;
5373     case T_SHORT:
5374       switch (to_elem_bt) {
5375         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5376         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5377         default: ShouldNotReachHere();
5378       }
5379       break;
5380     case T_INT:
5381       assert(to_elem_bt == T_LONG, "");
5382       vpmovzxdq(dst, src, vlen_enc);
5383       break;
5384     default:
5385       ShouldNotReachHere();
5386   }
5387 }
5388 
5389 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5390                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5391   switch (from_elem_bt) {
5392     case T_BYTE:
5393       switch (to_elem_bt) {
5394         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5395         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5396         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5397         default: ShouldNotReachHere();
5398       }
5399       break;
5400     case T_SHORT:
5401       switch (to_elem_bt) {
5402         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5403         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5404         default: ShouldNotReachHere();
5405       }
5406       break;
5407     case T_INT:
5408       assert(to_elem_bt == T_LONG, "");
5409       vpmovsxdq(dst, src, vlen_enc);
5410       break;
5411     default:
5412       ShouldNotReachHere();
5413   }
5414 }
5415 
void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
                                         BasicType dst_bt, BasicType src_bt, int vlen) {
  // Cast a boolean vector mask (lane values 0 / -1) between lane sizes on
  // non-AVX512 targets. Widening sign-extends, preserving 0 / -1; narrowing
  // uses signed saturating packs, which also map 0 -> 0 and -1 -> -1.
  int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
  assert(vlen_enc != AVX_512bit, "");

  int dst_bt_size = type2aelembytes(dst_bt);
  int src_bt_size = type2aelembytes(src_bt);
  if (dst_bt_size > src_bt_size) {
    // Widening: sign-extend by the lane-size ratio.
    switch (dst_bt_size / src_bt_size) {
      case 2: vpmovsxbw(dst, src, vlen_enc); break;
      case 4: vpmovsxbd(dst, src, vlen_enc); break;
      case 8: vpmovsxbq(dst, src, vlen_enc); break;
      default: ShouldNotReachHere();
    }
  } else {
    assert(dst_bt_size < src_bt_size, "");
    // Narrowing: pack instructions operate within 128-bit lanes, so for
    // 256-bit vectors a vpermq(0x08) gathers the meaningful quadwords from
    // both lanes into the low half after packing.
    switch (src_bt_size / dst_bt_size) {
      case 2: {
        if (vlen_enc == AVX_128bit) {
          vpacksswb(dst, src, src, vlen_enc);
        } else {
          vpacksswb(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
        }
        break;
      }
      case 4: {
        // Two pack steps: dword -> word -> byte.
        if (vlen_enc == AVX_128bit) {
          vpackssdw(dst, src, src, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpackssdw(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      case 8: {
        // Quadword -> byte: drop the upper dword of each quadword first
        // (both dwords of a quadword mask hold the same 0 / -1 value).
        if (vlen_enc == AVX_128bit) {
          vpshufd(dst, src, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpshufd(dst, src, 0x08, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, AVX_128bit);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      default: ShouldNotReachHere();
    }
  }
}
5470 
5471 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5472                                    bool merge, BasicType bt, int vlen_enc) {
5473   if (bt == T_INT) {
5474     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5475   } else {
5476     assert(bt == T_LONG, "");
5477     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5478   }
5479 }
5480 
5481 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5482                                    bool merge, BasicType bt, int vlen_enc) {
5483   if (bt == T_INT) {
5484     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5485   } else {
5486     assert(bt == T_LONG, "");
5487     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5488   }
5489 }
5490 
void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
                                               Register rtmp2, XMMRegister xtmp, int mask_len,
                                               int vec_enc) {
  // Expand a bit-per-lane mask held in GPR 'src' into a byte-per-lane vector
  // mask in 'dst' (each destination byte becomes 0 or 1). PDEP against the
  // constant 0x0101010101010101 deposits 8 mask bits at a time, one bit into
  // the LSB of each byte of a 64-bit chunk.
  int index = 0;
  int vindex = 0;
  mov64(rtmp1, 0x0101010101010101L);
  pdepq(rtmp1, src, rtmp1);
  if (mask_len > 8) {
    // More chunks to come: keep the remaining mask bits in rtmp2 and stage
    // the first chunk in xtmp as well.
    movq(rtmp2, src);
    vpxor(xtmp, xtmp, xtmp, vec_enc);
    movq(xtmp, rtmp1);
  }
  movq(dst, rtmp1);

  // Consume the remaining mask bits 8 at a time, filling alternate 64-bit
  // halves of xtmp and flushing completed 128-bit lanes into dst.
  mask_len -= 8;
  while (mask_len > 0) {
    assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
    index++;
    if ((index % 2) == 0) {
      pxor(xtmp, xtmp);
    }
    mov64(rtmp1, 0x0101010101010101L);
    shrq(rtmp2, 8);
    pdepq(rtmp1, rtmp2, rtmp1);
    pinsrq(xtmp, rtmp1, index % 2);
    vindex = index / 2;
    if (vindex) {
      // Write entire 16 byte vector when both 64 bit
      // lanes are updated to save redundant instructions.
      if (index % 2) {
        vinsertf128(dst, dst, xtmp, vindex);
      }
    } else {
      vmovdqu(dst, xtmp);
    }
    mask_len -= 8;
  }
}
5529 
void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
  // Compute a scalar mask query over the mask bits held in GPR 'tmp'.
  switch(opc) {
    case Op_VectorMaskTrueCount:
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      // Index of the highest set bit, or -1 when the mask is empty.
      if (VM_Version::supports_lzcnt()) {
        lzcntq(tmp, tmp);
        movl(dst, 63);
        subl(dst, tmp);  // 63 - lzcnt == bit index; empty mask gives 63 - 64 == -1
      } else {
        movl(dst, -1);
        bsrq(tmp, tmp);  // BSR leaves dst undefined on zero input, hence the cmov
        cmov32(Assembler::notZero, dst, tmp);
      }
      break;
    case Op_VectorMaskFirstTrue:
      // Index of the lowest set bit, or masklen when the mask is empty.
      if (VM_Version::supports_bmi1()) {
        if (masklen < 32) {
          // Plant a sentinel bit at position masklen so an empty mask
          // reports masklen rather than tzcnt's zero-input result.
          orl(tmp, 1 << masklen);
          tzcntl(dst, tmp);
        } else if (masklen == 32) {
          tzcntl(dst, tmp);  // tzcnt of 0 is 32 == masklen, no sentinel needed
        } else {
          assert(masklen == 64, "");
          tzcntq(dst, tmp);  // tzcnt of 0 is 64 == masklen, no sentinel needed
        }
      } else {
        if (masklen < 32) {
          orl(tmp, 1 << masklen);
          bsfl(dst, tmp);
        } else {
          assert(masklen == 32 || masklen == 64, "");
          movl(dst, masklen);  // default result for an empty mask
          if (masklen == 32)  {
            bsfl(tmp, tmp);
          } else {
            bsfq(tmp, tmp);
          }
          cmov32(Assembler::notZero, dst, tmp);
        }
      }
      break;
    case Op_VectorMaskToLong:
      // The (clipped) mask bits are already the answer.
      assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
      break;
    default: assert(false, "Unhandled mask operation");
  }
}
5579 
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
                                              int masklen, int masksize, int vec_enc) {
  // Scalar mask query (true count / first / last / toLong) over an AVX512
  // opmask register: move the mask bits into GPR 'tmp', clip stray upper
  // bits if needed, then delegate to vector_mask_operation_helper.
  assert(VM_Version::supports_popcnt(), "");

  if(VM_Version::supports_avx512bw()) {
    kmovql(tmp, mask);
  } else {
    // Without AVX512BW only 16-bit opmask moves are available.
    assert(masklen <= 16, "");
    kmovwl(tmp, mask);
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5599 
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              Register tmp, int masklen, BasicType bt, int vec_enc) {
  // Scalar mask query over a vector-register mask (AVX/AVX2 path): extract
  // one bit per lane into GPR 'tmp' with the appropriate movmsk form, clip
  // stray upper bits, then delegate to vector_mask_operation_helper.
  assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
         (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
  assert(VM_Version::supports_popcnt(), "");

  bool need_clip = false;
  switch(bt) {
    case T_BOOLEAN:
      // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1
      // Negate (0 - x) to turn 1 into -1 so vpmovmskb picks up the sign bit.
      vpxor(xtmp, xtmp, xtmp, vec_enc);
      vpsubb(xtmp, xtmp, mask, vec_enc);
      vpmovmskb(tmp, xtmp, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_BYTE:
      vpmovmskb(tmp, mask, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_SHORT:
      // Narrow word lanes to bytes first; 256-bit packs need a cross-lane
      // permute to gather the meaningful quadwords.
      vpacksswb(xtmp, mask, mask, vec_enc);
      if (masklen >= 16) {
        vpermpd(xtmp, xtmp, 8, vec_enc);
      }
      vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
      need_clip = masklen < 16;
      break;
    case T_INT:
    case T_FLOAT:
      vmovmskps(tmp, mask, vec_enc);
      need_clip = masklen < 4;
      break;
    case T_LONG:
    case T_DOUBLE:
      vmovmskpd(tmp, mask, vec_enc);
      need_clip = masklen < 2;
      break;
    default: assert(false, "Unhandled type, %s", type2name(bt));
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  if (need_clip && opc != Op_VectorMaskFirstTrue) {
    // need_clip implies masklen < 32
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5649 
void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
                                             Register rtmp2, int mask_len) {
  // Compress the set bits of 'src' down to the low end: dst receives a mask
  // whose low popcount(src) bits are set. PEXT over an all-ones source
  // gathers one 1-bit for every set bit of the (clipped) mask.
  kmov(rtmp1, src);
  andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));  // clip to mask_len bits
  mov64(rtmp2, -1L);
  pextq(rtmp2, rtmp2, rtmp1);
  kmov(dst, rtmp2);
}
5658 
void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
                                                    XMMRegister mask, Register rtmp, Register rscratch,
                                                    XMMRegister permv, XMMRegister xtmp, BasicType bt,
                                                    int vec_enc) {
  // AVX2 fallback for lane compress/expand: the mask's movmsk value indexes
  // into a precomputed table of 32-byte permutation rows, which is applied
  // with vpermps; unused lanes are then blended with zero.
  assert(type2aelembytes(bt) >= 4, "");
  assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
  address compress_perm_table = nullptr;
  address expand_perm_table = nullptr;
  if (type2aelembytes(bt) == 8) {
    compress_perm_table = StubRoutines::x86::compress_perm_table64();
    expand_perm_table  = StubRoutines::x86::expand_perm_table64();
    vmovmskpd(rtmp, mask, vec_enc);
  } else {
    compress_perm_table = StubRoutines::x86::compress_perm_table32();
    expand_perm_table = StubRoutines::x86::expand_perm_table32();
    vmovmskps(rtmp, mask, vec_enc);
  }
  shlq(rtmp, 5); // for 32 byte permute row.
  if (opcode == Op_CompressV) {
    lea(rscratch, ExternalAddress(compress_perm_table));
  } else {
    lea(rscratch, ExternalAddress(expand_perm_table));
  }
  addptr(rtmp, rscratch);
  vmovdqu(permv, Address(rtmp));
  vpermps(dst, permv, src, Assembler::AVX_256bit);
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with zero vector using permute mask, each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, this can potentially be used as a blending mask after
  // compressing/expanding the source vector lanes.
  vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
}
5692 
5693 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5694                                                bool merge, BasicType bt, int vec_enc) {
5695   if (opcode == Op_CompressV) {
5696     switch(bt) {
5697     case T_BYTE:
5698       evpcompressb(dst, mask, src, merge, vec_enc);
5699       break;
5700     case T_CHAR:
5701     case T_SHORT:
5702       evpcompressw(dst, mask, src, merge, vec_enc);
5703       break;
5704     case T_INT:
5705       evpcompressd(dst, mask, src, merge, vec_enc);
5706       break;
5707     case T_FLOAT:
5708       evcompressps(dst, mask, src, merge, vec_enc);
5709       break;
5710     case T_LONG:
5711       evpcompressq(dst, mask, src, merge, vec_enc);
5712       break;
5713     case T_DOUBLE:
5714       evcompresspd(dst, mask, src, merge, vec_enc);
5715       break;
5716     default:
5717       fatal("Unsupported type %s", type2name(bt));
5718       break;
5719     }
5720   } else {
5721     assert(opcode == Op_ExpandV, "");
5722     switch(bt) {
5723     case T_BYTE:
5724       evpexpandb(dst, mask, src, merge, vec_enc);
5725       break;
5726     case T_CHAR:
5727     case T_SHORT:
5728       evpexpandw(dst, mask, src, merge, vec_enc);
5729       break;
5730     case T_INT:
5731       evpexpandd(dst, mask, src, merge, vec_enc);
5732       break;
5733     case T_FLOAT:
5734       evexpandps(dst, mask, src, merge, vec_enc);
5735       break;
5736     case T_LONG:
5737       evpexpandq(dst, mask, src, merge, vec_enc);
5738       break;
5739     case T_DOUBLE:
5740       evexpandpd(dst, mask, src, merge, vec_enc);
5741       break;
5742     default:
5743       fatal("Unsupported type %s", type2name(bt));
5744       break;
5745     }
5746   }
5747 }
5748 
void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                           KRegister ktmp1, int vec_enc) {
  // Vector Math.signum on AVX512: start with dst = 0 - 1 = -1.0 per lane,
  // select +1.0 for non-negative lanes, then pass src straight through for
  // lanes that are NaN, -0.0 or 0.0 (EQ_UQ matches equal-or-unordered).
  if (opcode == Op_SignumVD) {
    vsubpd(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
  } else {
    assert(opcode == Op_SignumVF, "");
    vsubps(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmps(dst, ktmp1, dst, src, true, vec_enc);
  }
}
5770 
void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                          XMMRegister xtmp1, int vec_enc) {
  // Vector Math.signum on AVX/AVX2: same scheme as the EVEX variant but with
  // variable blends keyed off src's sign bit and an explicit compare mask.
  if (opcode == Op_SignumVD) {
    vsubpd(dst, zero, one, vec_enc);  // dst = -1.0 in every lane
    // if src < 0 ? -1 : 1
    vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  } else {
    assert(opcode == Op_SignumVF, "");
    vsubps(dst, zero, one, vec_enc);  // dst = -1.0f in every lane
    // if src < 0 ? -1 : 1
    vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  }
}
5790 
5791 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5792   if (VM_Version::supports_avx512bw()) {
5793     if (mask_len > 32) {
5794       kmovql(dst, src);
5795     } else {
5796       kmovdl(dst, src);
5797       if (mask_len != 32) {
5798         kshiftrdl(dst, dst, 32 - mask_len);
5799       }
5800     }
5801   } else {
5802     assert(mask_len <= 16, "");
5803     kmovwl(dst, src);
5804     if (mask_len != 16) {
5805       kshiftrwl(dst, dst, 16 - mask_len);
5806     }
5807   }
5808 }
5809 
5810 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5811   int lane_size = type2aelembytes(bt);
5812   if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5813       (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5814     movptr(rtmp, imm32);
5815     switch(lane_size) {
5816       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5817       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5818       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5819       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5820       fatal("Unsupported lane size %d", lane_size);
5821       break;
5822     }
5823   } else {
5824     movptr(rtmp, imm32);
5825     movq(dst, rtmp);
5826     switch(lane_size) {
5827       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5828       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5829       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5830       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5831       fatal("Unsupported lane size %d", lane_size);
5832       break;
5833     }
5834   }
5835 }
5836 
5837 //
5838 // Following is lookup table based popcount computation algorithm:-
5839 //       Index   Bit set count
5840 //     [ 0000 ->   0,
5841 //       0001 ->   1,
5842 //       0010 ->   1,
5843 //       0011 ->   2,
5844 //       0100 ->   1,
5845 //       0101 ->   2,
5846 //       0110 ->   2,
5847 //       0111 ->   3,
5848 //       1000 ->   1,
5849 //       1001 ->   2,
5850 //       1010 ->   3,
5851 //       1011 ->   3,
5852 //       1100 ->   2,
5853 //       1101 ->   3,
5854 //       1111 ->   4 ]
5855 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5856 //     shuffle indices for lookup table access.
5857 //  b. Right shift each byte of vector lane by 4 positions.
5858 //  c. Count the number of 1s in 4 MSB bits each byte. These bits are used as
5859 //     shuffle indices for lookup table access.
5860 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5861 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5862 //     count of all the bytes of a quadword.
5863 //  f. Perform step e. for upper 128bit vector lane.
5864 //  g. Pack the bitset count of quadwords back to double word.
5865 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
5866 
void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Per-byte popcount via the 16-entry nibble lookup table (steps a-d of the
  // algorithm described above): shuffle the LUT by each nibble and add the
  // two nibble counts per byte.
  assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
  vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
  vpsrlw(dst, src, 4, vec_enc);           // move high nibbles into low position
  vpand(dst, dst, xtmp1, vec_enc);        // isolate high nibbles
  vpand(xtmp1, src, xtmp1, vec_enc);      // isolate low nibbles
  vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
  vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);  // LUT[low nibble]
  vpshufb(dst, xtmp2, dst, vec_enc);      // LUT[high nibble]
  vpaddb(dst, dst, xtmp1, vec_enc);       // per-byte popcount
}
5879 
void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Per-int popcount: byte popcounts land in xtmp1 (dst serves as a temp),
  // then vpsadbw sums bytes per quadword for both dword halves and the
  // results are packed back to dwords.
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Following code is as per steps e,f,g and h of above algorithm.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);   // zero-extend upper dwords to quadwords
  vpsadbw(dst, dst, xtmp2, vec_enc);        // byte-sum per quadword
  vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); // zero-extend lower dwords to quadwords
  vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
  vpackuswb(dst, xtmp1, dst, vec_enc);      // pack quadword sums back to dwords
}
5891 
void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Per-short popcount: compute byte popcounts into xtmp1 (dst serves as a
  // temp), then sum the two byte counts of each word.
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Add the popcount of upper and lower bytes of word.
  vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
  vpsrlw(dst, xtmp1, 8, vec_enc);      // upper-byte count moved to low byte
  vpand(xtmp1, xtmp1, xtmp2, vec_enc); // lower-byte count
  vpaddw(dst, dst, xtmp1, vec_enc);
}
5901 
void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Per-long popcount: byte popcounts into xtmp1 (dst serves as a temp),
  // then vpsadbw against zero sums the eight byte counts of each quadword.
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpsadbw(dst, xtmp1, xtmp2, vec_enc);
}
5908 
5909 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5910                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5911   switch(bt) {
5912     case T_LONG:
5913       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5914       break;
5915     case T_INT:
5916       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5917       break;
5918     case T_CHAR:
5919     case T_SHORT:
5920       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5921       break;
5922     case T_BYTE:
5923     case T_BOOLEAN:
5924       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5925       break;
5926     default:
5927       fatal("Unsupported type %s", type2name(bt));
5928       break;
5929   }
5930 }
5931 
void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                      KRegister mask, bool merge, int vec_enc) {
  // Native vector popcount: int/long lanes need AVX512 VPOPCNTDQ, subword
  // lanes need AVX512 BITALG. Supports masked/merging operation.
  assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
  switch(bt) {
    case T_LONG:
      assert(VM_Version::supports_avx512_vpopcntdq(), "");
      evpopcntq(dst, mask, src, merge, vec_enc);
      break;
    case T_INT:
      assert(VM_Version::supports_avx512_vpopcntdq(), "");
      evpopcntd(dst, mask, src, merge, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      assert(VM_Version::supports_avx512_bitalg(), "");
      evpopcntw(dst, mask, src, merge, vec_enc);
      break;
    case T_BYTE:
    case T_BOOLEAN:
      assert(VM_Version::supports_avx512_bitalg(), "");
      evpopcntb(dst, mask, src, merge, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
5959 
5960 // Bit reversal algorithm first reverses the bits of each byte followed by
5961 // a byte level reversal for multi-byte primitive types (short/int/long).
5962 // Algorithm performs a lookup table access to get reverse bit sequence
5963 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte
5964 // is obtained by swapping the reverse bit sequences of upper and lower
5965 // nibble of a byte.
void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Reverse the bits of each lane, per the algorithm described above:
  // reverse bits within each byte (via a nibble LUT, or shifts/masks when
  // vpshufb is not usable), then byte-reverse multi-byte lane types.
  if (VM_Version::supports_avx512vlbw()) {

    // Get the reverse bit sequence of lower nibble of each byte.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
    evpandq(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    evporq(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);

  } else if(vec_enc == Assembler::AVX_512bit) {
    // Shift based bit reversal.
    assert(bt == T_LONG || bt == T_INT, "");

    // Swap lower and upper nibble of each byte.
    vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);

    // Swap two least and most significant bits of each nibble.
    vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);

    // Swap adjacent pair of bits.
    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);

    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
  } else {
    // AVX/AVX2 path: same nibble-LUT scheme as the AVX512VLBW branch but
    // with the non-EVEX logical ops.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);

    // Get the reverse bit sequence of lower nibble of each byte.
    vpand(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    vpor(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);
  }
}
6023 
void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
                                                XMMRegister xtmp, Register rscratch) {
  // Bit reversal using a single GF2P8AFFINEQB with a bit-reversal matrix
  // broadcast from 'mask', followed by a byte reversal for multi-byte lanes.
  assert(VM_Version::supports_gfni(), "");
  assert(rscratch != noreg || always_reachable(mask), "missing");

  // Galois field instruction based bit reversal based on following algorithm.
  // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
  vpbroadcastq(xtmp, mask, vec_enc, rscratch);
  vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
  vector_reverse_byte(bt, dst, xtmp, vec_enc);
}
6035 
void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
                                          XMMRegister xtmp1, Register rtmp, int vec_enc) {
  // Swap adjacent nbits-wide bit groups: the groups selected by 'bitmask'
  // are shifted left by nbits, the complementary groups shifted right by
  // nbits, and the two halves OR-ed back together.
  vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
  evpandq(dst, xtmp1, src, vec_enc);
  vpsllq(dst, dst, nbits, vec_enc);
  vpandn(xtmp1, xtmp1, src, vec_enc);
  vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
  evporq(dst, dst, xtmp1, vec_enc);
}
6045 
void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Byte reversal for 512-bit vectors using rotates and the 8-bit group
  // swap, avoiding vpshufb (which needs AVX512BW at this width).
  // Shift based bit reversal.
  assert(VM_Version::supports_evex(), "");
  switch(bt) {
    case T_LONG:
      // Swap upper and lower double word of each quad word.
      evprorq(xtmp1, k0, src, 32, true, vec_enc);
      evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);  // then swap words within each dword
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_INT:
      // Swap upper and lower word of each double word.
      evprord(xtmp1, k0, src, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      // Swap upper and lower byte of each word.
      vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
      break;
    case T_BYTE:
      // Single-byte lanes: nothing to reverse, just copy.
      evmovdquq(dst, k0, src, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6075 
void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
  // Reverse the bytes of each lane. Byte lanes are a plain copy; wider lanes
  // are shuffled with a type-specific precomputed permutation mask.
  if (bt == T_BYTE) {
    if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
      evmovdquq(dst, k0, src, true, vec_enc);
    } else {
      vmovdqu(dst, src);
    }
    return;
  }
  // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
  // pre-computed shuffle indices.
  switch(bt) {
    case T_LONG:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
      break;
    case T_INT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
      break;
    case T_CHAR:
    case T_SHORT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
  vpshufb(dst, src, dst, vec_enc);
}
6104 
void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                        KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
  // Vector leading-zero count on AVX512CD targets. Int/long use vplzcnt
  // directly; short widens to dwords padded with all-ones so the dword lzcnt
  // saturates at 16; byte uses a nibble lookup table.
  assert(is_integral_type(bt), "");
  assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
  assert(VM_Version::supports_avx512cd(), "");
  switch(bt) {
    case T_LONG:
      evplzcntq(dst, ktmp, src, merge, vec_enc);
      break;
    case T_INT:
      evplzcntd(dst, ktmp, src, merge, vec_enc);
      break;
    case T_SHORT:
      // Build all-ones in xtmp1, interleave each src word above a 0xFFFF low
      // word: the dword lzcnt then equals the 16-bit lzcnt, capped at 16 by
      // the all-ones padding. Results are packed back to words.
      vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
      vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
      evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
      vpunpckhwd(dst, xtmp1, src, vec_enc);
      evplzcntd(dst, ktmp, dst, merge, vec_enc);
      vpackusdw(dst, xtmp2, dst, vec_enc);
      break;
    case T_BYTE:
      // T1 = Compute leading zero counts of 4 LSB bits of each byte by
      // accessing the lookup table.
      // T2 = Compute leading zero counts of 4 MSB bits of each byte by
      // accessing the lookup table.
      // Add T1 to T2 if 4 MSB bits of byte are all zeros.
      assert(VM_Version::supports_avx512bw(), "");
      evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
      vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
      vpand(xtmp2, dst, src, vec_enc);
      vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
      vpsrlw(xtmp3, src, 4, vec_enc);
      vpand(xtmp3, dst, xtmp3, vec_enc);
      vpshufb(dst, xtmp1, xtmp3, vec_enc);
      vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
      evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
      evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6149 
// Leading zero count for byte lanes on AVX (non-EVEX) targets. Uses a
// 16-entry lookup table, indexed by a 4-bit nibble, holding the leading
// zero count of each possible nibble value.
// Note: xtmp1 is left holding all-zeroes on return; the short variant
// below relies on that.
void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  // xtmp1 = lookup table, xtmp2 = 0x0F byte mask for nibble extraction.
  vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
  vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
  // T1 = Compute leading zero counts of 4 LSB bits of each byte by
  // accessing the lookup table.
  vpand(dst, xtmp2, src, vec_enc);
  vpshufb(dst, xtmp1, dst, vec_enc);
  // T2 = Compute leading zero counts of 4 MSB bits of each byte by
  // accessing the lookup table.
  vpsrlw(xtmp3, src, 4, vec_enc);
  vpand(xtmp3, xtmp2, xtmp3, vec_enc);
  vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
  // Add T1 to T2 if 4 MSB bits of byte are all zeros.
  vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
  // xtmp3 = mask of bytes whose high nibble is zero.
  vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
  vpaddb(dst, dst, xtmp2, vec_enc);
  // Select T1 + T2 where the high nibble was zero, T2 alone otherwise.
  vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
}
6169 
// Leading zero count for short lanes on AVX (non-EVEX) targets: compute
// per-byte counts first, then combine the two byte counts of each word.
void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add zero counts of lower byte and upper byte of a word if
  // upper byte holds a zero value.
  vpsrlw(xtmp3, src, 8, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
  // xtmp2 high byte = lower-byte count + upper-byte count. A zero upper
  // byte contributes a count of 8, giving 8 + lower-byte count.
  vpsllw(xtmp2, dst, 8, vec_enc);
  vpaddw(xtmp2, xtmp2, dst, vec_enc);
  // Where the upper byte of src was zero take the combined count,
  // otherwise keep just the upper-byte count.
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  // The per-word result sits in the high byte; shift it down.
  vpsrlw(dst, dst, 8, vec_enc);
}
6183 
// Leading zero count for int lanes on AVX (non-EVEX) targets, derived from
// the exponent field produced by an int -> float conversion.
void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                           XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // By converting the integer to a float, we can obtain the number of leading zeros based on the exponent of the float.
  // As the float exponent contains a bias of 127 for nonzero values, the bias must be removed before interpreting the
  // exponent as the leading zero count.

  // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
  // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
  // contributes to the leading number of zeros.
  vpsrld(dst, src, 1, vec_enc);
  vpandn(dst, dst, src, vec_enc);

  vcvtdq2ps(dst, dst, vec_enc);

  // By comparing the register to itself, all the bits in the destination are set.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);

  // Move the biased exponent to the low end of the lane and mask with 0xFF to discard the sign bit.
  // xtmp2 = 0xFF in each lane (-1 >>> 24).
  vpsrld(xtmp2, xtmp1, 24, vec_enc);
  vpsrld(dst, dst, 23, vec_enc);
  vpand(dst, xtmp2, dst, vec_enc);

  // Subtract 127 from the exponent, which removes the bias from the exponent.
  // xtmp2 = 127 in each lane (-1 >>> 25).
  vpsrld(xtmp2, xtmp1, 25, vec_enc);
  vpsubd(dst, dst, xtmp2, vec_enc);

  // xtmp2 = 31 in each lane (-1 >>> 27), used for both the negative-input
  // blend and the final subtraction below.
  vpsrld(xtmp2, xtmp1, 27, vec_enc);

  // If the original value is 0 the exponent would not have bias, so the subtraction creates a negative number. If this
  // is found in any of the lanes, replace the lane with -1 from xtmp1.
  vblendvps(dst, dst, xtmp1, dst, vec_enc, true, xtmp3);

  // If the original value is negative, replace the lane with 31.
  vblendvps(dst, dst, xtmp2, src, vec_enc, true, xtmp3);

  // Subtract the exponent from 31, giving the final result. For 0, the result is 32 as the exponent was replaced with -1,
  // and for negative numbers the result is 0 as the exponent was replaced with 31.
  vpsubd(dst, xtmp2, dst, vec_enc);
}
6223 
// Leading zero count for long lanes on AVX (non-EVEX) targets: treat each
// long as two ints, count both halves, then combine.
void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  // Find the leading zeros of the top and bottom halves of the long individually.
  vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);

  // Move the top half result to the bottom half of xtmp1, setting the top half to 0.
  vpsrlq(xtmp1, dst, 32, vec_enc);
  // By moving the top half result to the right by 6 bits, if the top half was empty (i.e. 32 is returned) the result bit will
  // be in the most significant position of the bottom half.
  vpsrlq(xtmp2, dst, 6, vec_enc);

  // In the bottom half, add the top half and bottom half results.
  vpaddq(dst, xtmp1, dst, vec_enc);

  // For the bottom half, choose between the values using the most significant bit of xtmp2.
  // If the MSB is set, then bottom+top in dst is the resulting value. If the top half is less than 32 xtmp1 is chosen,
  // which contains only the top half result.
  // In the top half the MSB is always zero, so the value in xtmp1 is always chosen. This value is always 0, which clears
  // the lane as required.
  vblendvps(dst, xtmp1, dst, xtmp2, vec_enc, true, xtmp3);
}
6245 
6246 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6247                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6248                                                        Register rtmp, int vec_enc) {
6249   assert(is_integral_type(bt), "unexpected type");
6250   assert(vec_enc < Assembler::AVX_512bit, "");
6251   switch(bt) {
6252     case T_LONG:
6253       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6254       break;
6255     case T_INT:
6256       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6257       break;
6258     case T_SHORT:
6259       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6260       break;
6261     case T_BYTE:
6262       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6263       break;
6264     default:
6265       fatal("Unsupported type %s", type2name(bt));
6266       break;
6267   }
6268 }
6269 
6270 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6271   switch(bt) {
6272     case T_BYTE:
6273       vpsubb(dst, src1, src2, vec_enc);
6274       break;
6275     case T_SHORT:
6276       vpsubw(dst, src1, src2, vec_enc);
6277       break;
6278     case T_INT:
6279       vpsubd(dst, src1, src2, vec_enc);
6280       break;
6281     case T_LONG:
6282       vpsubq(dst, src1, src2, vec_enc);
6283       break;
6284     default:
6285       fatal("Unsupported type %s", type2name(bt));
6286       break;
6287   }
6288 }
6289 
// Trailing zero count computation is based on leading zero count operation as per
// following equation. All AVX3 targets support AVX512CD feature which offers
// direct vector instruction to compute leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp4 = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp4 = src - 1
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp4 = (src - 1) & ~src  (isolates the trailing zero run as ones)
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  // dst = lane width in bits - CLZ
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}
6308 
// Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp3 = 0
  vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
  // xtmp3 = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp3 = -src | src  (ones from the lowest set bit upward)
  vpor(xtmp3, xtmp3, src, vec_enc);
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  // dst = lane width in bits - POPC
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}
6324 
// Unsigned 32-bit division: quotient = rax /u divisor, left in rax.
// Register constraints follow the x86 DIV instruction: the dividend is
// in rax and rdx is clobbered.
void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  // A divisor with the sign bit set is >= 2^31 unsigned, so the quotient
  // can only be 0 or 1 and is computed branch-free below.
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    // rax = ~(dividend - divisor) & dividend in one instruction.
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  shrl(rax, 31);
  bind(done);
}
6348 
// Unsigned 32-bit remainder: remainder = rax %u divisor, left in rdx.
// Register constraints follow the x86 DIV instruction: the dividend is
// in rax; rax is clobbered.
void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movl(rdx, rax);  // keep the dividend in rdx
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  // Arithmetic shift yields 0 or -1; the and selects divisor when the
  // quotient is 1, which is then subtracted from the dividend.
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}
6374 
// Combined unsigned 32-bit division and remainder: quotient left in rax,
// remainder left in rdx. tmp is clobbered.
void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movl(rdx, rax);  // keep the dividend in rdx
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  // tmp = quotient ? divisor : 0, subtracted from the dividend.
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}
6405 
// Reverse the bit order of a 32-bit value: dst = bit-reverse(src).
// rtmp and both xtmp registers are used as scratch.
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    // The affine transform with this matrix reverses the bits within each byte.
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  // Both paths above reverse the bits within each byte; reversing the
  // byte order completes the full 32-bit bit reversal.
  bswapl(dst);
}
6444 
// Reverse the bit order of a 64-bit value: dst = bit-reverse(src).
// rtmp1/rtmp2 and both xtmp registers are used as scratch.
void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    // The affine transform with this matrix reverses the bits within each byte.
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);  // complement gives the 0xAAAA... mask
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);  // 0xCCCC... mask
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);  // 0xF0F0... mask
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  // Both paths above reverse the bits within each byte; reversing the
  // byte order completes the full 64-bit bit reversal.
  bswapq(dst);
}
6489 
// Unsigned 64-bit division: quotient = rax /u divisor, left in rax.
// Register constraints follow the x86 DIV instruction: the dividend is
// in rax and rdx is clobbered.
void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  // A divisor with the sign bit set is >= 2^63 unsigned, so the quotient
  // can only be 0 or 1 and is computed branch-free below.
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    // rax = ~(dividend - divisor) & dividend in one instruction.
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  shrq(rax, 63);
  bind(done);
}
6513 
// Unsigned 64-bit remainder: remainder = rax %u divisor, left in rdx.
// Register constraints follow the x86 DIV instruction: the dividend is
// in rax; rax is clobbered.
void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movq(rdx, rax);  // keep the dividend in rdx
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  // Arithmetic shift yields 0 or -1; the and selects divisor when the
  // quotient is 1, which is then subtracted from the dividend.
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}
6539 
// Combined unsigned 64-bit division and remainder: quotient left in rax,
// remainder left in rdx. tmp is clobbered.
void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);  // keep the dividend in rdx
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  // tmp = quotient ? divisor : 0, subtracted from the dividend.
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
6569 
// Cross 128-bit-lane byte rearrangement. vpshufb shuffles only within
// 128 bit lanes, so each source lane is broadcast across the whole vector
// in turn and the lanes whose shuffle index falls in the matching 16-index
// range are merged into dst under a mask.
void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are inlane operations and indices are determined using
  // lower 4 bit of each shuffle lane, thus all shuffle indices are
  // normalized to index range 0-15. This makes sure that all the multiples
  // of an index value are placed at same relative position in 128 bit
  // lane i.e. elements corresponding to shuffle indices 16, 32 and 64
  // will be 16th element in their respective 128 bit lanes.
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
  // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
  // original shuffle indices and move the shuffled lanes corresponding to true
  // mask to destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
  // and broadcasting second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);  // xtmp2 = 32 in every byte lane
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
  // and broadcasting third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);  // xtmp1 = 48 in every byte lane
  evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);  // xtmp2 = 64 in every byte lane
  evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}
6615 
6616 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6617                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6618   if (vlen_enc == AVX_128bit) {
6619     vpermilps(dst, src, shuffle, vlen_enc);
6620   } else if (bt == T_INT) {
6621     vpermd(dst, shuffle, src, vlen_enc);
6622   } else {
6623     assert(bt == T_FLOAT, "");
6624     vpermps(dst, shuffle, src, vlen_enc);
6625   }
6626 }
6627 
6628 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6629   switch(opcode) {
6630     case Op_AddHF: vaddsh(dst, src1, src2); break;
6631     case Op_SubHF: vsubsh(dst, src1, src2); break;
6632     case Op_MulHF: vmulsh(dst, src1, src2); break;
6633     case Op_DivHF: vdivsh(dst, src1, src2); break;
6634     default: assert(false, "%s", NodeClassNames[opcode]); break;
6635   }
6636 }
6637 
6638 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6639   switch(elem_bt) {
6640     case T_BYTE:
6641       if (ideal_opc == Op_SaturatingAddV) {
6642         vpaddsb(dst, src1, src2, vlen_enc);
6643       } else {
6644         assert(ideal_opc == Op_SaturatingSubV, "");
6645         vpsubsb(dst, src1, src2, vlen_enc);
6646       }
6647       break;
6648     case T_SHORT:
6649       if (ideal_opc == Op_SaturatingAddV) {
6650         vpaddsw(dst, src1, src2, vlen_enc);
6651       } else {
6652         assert(ideal_opc == Op_SaturatingSubV, "");
6653         vpsubsw(dst, src1, src2, vlen_enc);
6654       }
6655       break;
6656     default:
6657       fatal("Unsupported type %s", type2name(elem_bt));
6658       break;
6659   }
6660 }
6661 
6662 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6663   switch(elem_bt) {
6664     case T_BYTE:
6665       if (ideal_opc == Op_SaturatingAddV) {
6666         vpaddusb(dst, src1, src2, vlen_enc);
6667       } else {
6668         assert(ideal_opc == Op_SaturatingSubV, "");
6669         vpsubusb(dst, src1, src2, vlen_enc);
6670       }
6671       break;
6672     case T_SHORT:
6673       if (ideal_opc == Op_SaturatingAddV) {
6674         vpaddusw(dst, src1, src2, vlen_enc);
6675       } else {
6676         assert(ideal_opc == Op_SaturatingSubV, "");
6677         vpsubusw(dst, src1, src2, vlen_enc);
6678       }
6679       break;
6680     default:
6681       fatal("Unsupported type %s", type2name(elem_bt));
6682       break;
6683   }
6684 }
6685 
// Unsigned saturating subtraction for int/long lanes on EVEX targets:
// lanes that would underflow are clamped to zero.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                              XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when magnitude of second input is greater than first input.
  // overflow_mask = Inp1 <u Inp2
  evpcmpu(elem_bt, ktmp,  src2, src1, Assembler::lt, vlen_enc);
  // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
  evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
}
6694 
// Unsigned saturating subtraction for int/long lanes on AVX (non-EVEX)
// targets: lanes that would underflow are clamped to zero. The unsigned
// compare is emulated with a signed compare after biasing both inputs.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Emulate unsigned comparison using signed comparison
  // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
  vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
  vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);

  // xtmp2 = mask of lanes where src2 (biased) > src1 (biased), i.e. underflow.
  vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);

  // Res = INP1 - INP2 (non-commutative and non-associative)
  vpsub(elem_bt, dst, src1, src2, vlen_enc);
  // Res = Mask ? Zero : Res
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
}
6711 
// Unsigned saturating addition for int/long lanes on EVEX targets: lanes
// that overflow are clamped to all-ones (the unsigned maximum).
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                               XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned values ranges comprise of only +ve numbers, thus there exist only an upper bound saturation.
  // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // T1 = SRC1 | SRC2
  vpor(xtmp1, src1, src2, vlen_enc);
  // Max_Unsigned = -1
  vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
  // Unsigned compare:  Mask = Res <u T1
  evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
  // res  = Mask ? Max_Unsigned : Res
  evpblend(elem_bt, dst, ktmp,  dst, xtmp2, true, vlen_enc);
}
6727 
6728 //
6729 // Section 2-13 Hacker's Delight list following overflow detection check for saturating
6730 // unsigned addition operation.
6731 //    overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
6732 //
6733 // We empirically determined its semantic equivalence to following reduced expression
6734 //    overflow_mask =  (a + b) <u (a | b)
6735 //
6736 // and also verified it though Alive2 solver.
6737 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6738 //
6739 
// Unsigned saturating addition for int/long lanes on AVX (non-EVEX)
// targets: lanes that overflow are clamped to all-ones (the unsigned
// maximum). Overflow detection per the reduced expression above.
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // Compute T1 = INP1 | INP2
  vpor(xtmp3, src1, src2, vlen_enc);
  // xtmp2 = minimum signed value; as a side effect xtmp1 is left holding
  // all-ones (unsigned max), used as the saturation value in the final blend.
  vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  // Convert T1 to signed value, T1 = T1 + MIN_VALUE
  vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
  // Convert Res to signed value, Res<s> = Res + MIN_VALUE
  vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<s> <s T1
  if (elem_bt == T_INT) {
    vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
  } else {
    assert(elem_bt == T_LONG, "");
    vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
  }
  // Replace overflowed lanes with all-ones from xtmp1.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
6761 
// Emulation of vpmovq2m (set mask bit per quad lane from its sign bit)
// for targets without AVX512DQ: arithmetic-shift each lane down to all
// sign bits and compare against -1. If xtmp2_hold_M1 is true the caller
// guarantees xtmp2 already holds -1 in every lane, so materializing the
// constant can be skipped.
void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovq2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      // xtmp2 = -1 in every lane
      vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    evpsraq(xtmp1, src, 63, vlen_enc);
    evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}
6775 
// Emulation of vpmovd2m (set mask bit per dword lane from its sign bit)
// for targets without AVX512DQ: arithmetic-shift each lane down to all
// sign bits and compare against -1. If xtmp2_hold_M1 is true the caller
// guarantees xtmp2 already holds -1 in every lane, so materializing the
// constant can be skipped.
void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovd2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      // xtmp2 = -1 in every lane
      vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    vpsrad(xtmp1, src, 31, vlen_enc);
    Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}
6789 
6790 
6791 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6792   if (elem_bt == T_LONG) {
6793     if (VM_Version::supports_evex()) {
6794       evpsraq(dst, src, 63, vlen_enc);
6795     } else {
6796       vpsrad(dst, src, 31, vlen_enc);
6797       vpshufd(dst, dst, 0xF5, vlen_enc);
6798     }
6799   } else {
6800     assert(elem_bt == T_INT, "");
6801     vpsrad(dst, src, 31, vlen_enc);
6802   }
6803 }
6804 
6805 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6806   if (compute_allones) {
6807     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6808       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6809     } else {
6810       vpcmpeqq(allones, allones, allones, vlen_enc);
6811     }
6812   }
6813   if (elem_bt == T_LONG) {
6814     vpsrlq(dst, allones, 1, vlen_enc);
6815   } else {
6816     assert(elem_bt == T_INT, "");
6817     vpsrld(dst, allones, 1, vlen_enc);
6818   }
6819 }
6820 
6821 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6822   if (compute_allones) {
6823     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6824       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6825     } else {
6826       vpcmpeqq(allones, allones, allones, vlen_enc);
6827     }
6828   }
6829   if (elem_bt == T_LONG) {
6830     vpsllq(dst, allones, 63, vlen_enc);
6831   } else {
6832     assert(elem_bt == T_INT, "");
6833     vpslld(dst, allones, 31, vlen_enc);
6834   }
6835 }
6836 
6837 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
6838                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6839   switch(elem_bt) {
6840     case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6841     case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6842     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6843     case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6844     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6845   }
6846 }
6847 
6848 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6849   switch(elem_bt) {
6850     case  T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
6851     case  T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
6852     case  T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6853     case  T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
6854     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6855   }
6856 }
6857 
6858 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6859                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6860   if (elem_bt == T_LONG) {
6861     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6862   } else {
6863     assert(elem_bt == T_INT, "");
6864     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6865   }
6866 }
6867 
// Signed saturating addition/subtraction for int/long lanes on EVEX
// targets: overflowed lanes are replaced with MAX_VALUE or MIN_VALUE
// depending on the sign of the first input.
void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                         KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signed'ness.
  // Overflow detection based on Hacker's delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Compute overflow detection mask.
  evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 hold -1 in all its lanes after above call.

  // Compute mask based on first input polarity.
  evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);

  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in first input polarity mask holds a min value.
  evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
  // Blend destination lanes with saturated values using overflow detection mask.
  evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
}
6910 
6911 
// AVX (non-EVEX) flavor of saturating add/sub emulation for 32/64-bit lanes.
// Uses the same Hacker's Delight overflow math as the EVEX version, but keeps
// the masks in vector registers (sign-extended to full-lane masks) and blends
// with VPBLENDVB instead of opmask registers. Clobbers xtmp1 through xtmp4;
// dst receives the final result.
void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                        XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                        XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection based on Hacker's Delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Sign-extend to compute overflow detection mask: xtmp3 = full-lane mask
  // (all-ones for overflowed lanes).
  vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);

  // xtmp1 = -1 in all lanes, used to seed the MIN/MAX constant generation.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
  // xtmp2 = per-lane MAX, xtmp1 = per-lane MIN saturating constants.
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose saturating min/max vector using first input polarity mask:
  // lanes where src1 is negative get MIN, the rest get MAX.
  vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
  vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);

  // Blend result with saturating vector using overflow detection mask.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
6952 
6953 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6954   switch(elem_bt) {
6955     case T_BYTE:
6956       if (ideal_opc == Op_SaturatingAddV) {
6957         vpaddsb(dst, src1, src2, vlen_enc);
6958       } else {
6959         assert(ideal_opc == Op_SaturatingSubV, "");
6960         vpsubsb(dst, src1, src2, vlen_enc);
6961       }
6962       break;
6963     case T_SHORT:
6964       if (ideal_opc == Op_SaturatingAddV) {
6965         vpaddsw(dst, src1, src2, vlen_enc);
6966       } else {
6967         assert(ideal_opc == Op_SaturatingSubV, "");
6968         vpsubsw(dst, src1, src2, vlen_enc);
6969       }
6970       break;
6971     default:
6972       fatal("Unsupported type %s", type2name(elem_bt));
6973       break;
6974   }
6975 }
6976 
6977 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6978   switch(elem_bt) {
6979     case T_BYTE:
6980       if (ideal_opc == Op_SaturatingAddV) {
6981         vpaddusb(dst, src1, src2, vlen_enc);
6982       } else {
6983         assert(ideal_opc == Op_SaturatingSubV, "");
6984         vpsubusb(dst, src1, src2, vlen_enc);
6985       }
6986       break;
6987     case T_SHORT:
6988       if (ideal_opc == Op_SaturatingAddV) {
6989         vpaddusw(dst, src1, src2, vlen_enc);
6990       } else {
6991         assert(ideal_opc == Op_SaturatingSubV, "");
6992         vpsubusw(dst, src1, src2, vlen_enc);
6993       }
6994       break;
6995     default:
6996       fatal("Unsupported type %s", type2name(elem_bt));
6997       break;
6998   }
6999 }
7000 
7001 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
7002                                                      XMMRegister src2, int vlen_enc) {
7003   switch(elem_bt) {
7004     case T_BYTE:
7005       evpermi2b(dst, src1, src2, vlen_enc);
7006       break;
7007     case T_SHORT:
7008       evpermi2w(dst, src1, src2, vlen_enc);
7009       break;
7010     case T_INT:
7011       evpermi2d(dst, src1, src2, vlen_enc);
7012       break;
7013     case T_LONG:
7014       evpermi2q(dst, src1, src2, vlen_enc);
7015       break;
7016     case T_FLOAT:
7017       evpermi2ps(dst, src1, src2, vlen_enc);
7018       break;
7019     case T_DOUBLE:
7020       evpermi2pd(dst, src1, src2, vlen_enc);
7021       break;
7022     default:
7023       fatal("Unsupported type %s", type2name(elem_bt));
7024       break;
7025   }
7026 }
7027 
7028 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
7029   if (is_unsigned) {
7030     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7031   } else {
7032     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7033   }
7034 }
7035 
7036 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
7037   if (is_unsigned) {
7038     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7039   } else {
7040     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7041   }
7042 }
7043 
7044 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
7045   switch(opcode) {
7046     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7047     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7048     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7049     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7050     default: assert(false, "%s", NodeClassNames[opcode]); break;
7051   }
7052 }
7053 
7054 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7055   switch(opcode) {
7056     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7057     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7058     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7059     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7060     default: assert(false, "%s", NodeClassNames[opcode]); break;
7061   }
7062 }
7063 
// Scalar FP16 max/min: delegates to the vector implementation at 128-bit
// width, which handles the NaN and signed-zero cases.
void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
  vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
}
7068 
// Vectorized FP16 max/min that corrects for VMAXPH/VMINPH's asymmetric
// handling of signed zeros and NaNs: the operands are conditionally swapped
// (using the sign bits of one input) before the min/max so the instruction's
// "return the second source" rule yields the desired lane, then a final
// masked move patches lanes where the swapped first operand was NaN.
// Clobbers ktmp, xtmp1 and xtmp2.
void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
    // Move sign bits of src2 to mask register.
    evpmovw2m(ktmp, src2, vlen_enc);
    // xtmp1 = src2 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make the second source operand a +ve value.
    // As per instruction semantic, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned
    // dst = max(xtmp1, xtmp2)
    evmaxph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if it's a NaN value,
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  } else {
    assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
    // Move sign bits of src1 to mask register.
    evpmovw2m(ktmp, src1, vlen_enc);
    // xtmp1 = src1 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src1 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make the second source operand a -ve value.
    // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned.
    // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
    // or a valid floating-point value, is written to the result.
    // dst = min(xtmp1, xtmp2)
    evminph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if it's a NaN value,
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  }
}