1 /*
   2  * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "../../share/runtime/globals.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/globals.hpp"
  37 #include "runtime/objectMonitor.hpp"
  38 #include "runtime/objectMonitorTable.hpp"
  39 #include "runtime/stubRoutines.hpp"
  40 #include "runtime/synchronizer.hpp"
  41 #include "utilities/checkedCast.hpp"
  42 #include "utilities/globalDefinitions.hpp"
  43 #include "utilities/powerOfTwo.hpp"
  44 #include "utilities/sizes.hpp"
  45 
  // Helpers for annotating and aborting generated code.
  //
  // BLOCK_COMMENT(str): expands to block_comment(str) in non-PRODUCT builds and
  //                     to nothing in PRODUCT builds.
  // STOP(error):        always expands to stop(error); non-PRODUCT builds call
  //                     block_comment(error) first.
  //
  // The two-call form of STOP is wrapped in do { } while (0) so that STOP(...)
  // is a single statement. Without the wrapper, `if (cond) STOP("x");` would
  // guard only block_comment() and run stop() unconditionally.
  #ifdef PRODUCT
  #define BLOCK_COMMENT(str) /* nothing */
  #define STOP(error) stop(error)
  #else
  #define BLOCK_COMMENT(str) block_comment(str)
  #define STOP(error) do { block_comment(error); stop(error); } while (0)
  #endif
  53 
  54 // C2 compiled method's prolog code.
  55 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  56   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  57 
  58   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  59   // Remove word for return addr
  60   framesize -= wordSize;
  61   stack_bang_size -= wordSize;
  62 
  63   // Calls to C2R adapters often do not accept exceptional returns.
  64   // We require that their callers must bang for them.  But be careful, because
  65   // some VM calls (such as call site linkage) can use several kilobytes of
  66   // stack.  But the stack safety zone should account for that.
  67   // See bugs 4446381, 4468289, 4497237.
  68   if (stack_bang_size > 0) {
  69     generate_stack_overflow_check(stack_bang_size);
  70 
  71     // We always push rbp, so that on return to interpreter rbp, will be
  72     // restored correctly and we can correct the stack.
  73     push(rbp);
  74     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  75     if (PreserveFramePointer) {
  76       mov(rbp, rsp);
  77     }
  78     // Remove word for ebp
  79     framesize -= wordSize;
  80 
  81     // Create frame
  82     if (framesize) {
  83       subptr(rsp, framesize);
  84     }
  85   } else {
  86     subptr(rsp, framesize);
  87 
  88     // Save RBP register now.
  89     framesize -= wordSize;
  90     movptr(Address(rsp, framesize), rbp);
  91     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  92     if (PreserveFramePointer) {
  93       movptr(rbp, rsp);
  94       if (framesize > 0) {
  95         addptr(rbp, framesize);
  96       }
  97     }
  98   }
  99 
 100   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 101     framesize -= wordSize;
 102     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 103   }
 104 
 105 #ifdef ASSERT
 106   if (VerifyStackAtCalls) {
 107     Label L;
 108     push(rax);
 109     mov(rax, rsp);
 110     andptr(rax, StackAlignmentInBytes-1);
 111     cmpptr(rax, StackAlignmentInBytes-wordSize);
 112     pop(rax);
 113     jcc(Assembler::equal, L);
 114     STOP("Stack is not properly aligned!");
 115     bind(L);
 116   }
 117 #endif
 118 
 119   if (!is_stub) {
 120     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 121     // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 122     Label dummy_slow_path;
 123     Label dummy_continuation;
 124     Label* slow_path = &dummy_slow_path;
 125     Label* continuation = &dummy_continuation;
 126     if (!Compile::current()->output()->in_scratch_emit_size()) {
 127       // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 128       C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 129       Compile::current()->output()->add_stub(stub);
 130       slow_path = &stub->entry();
 131       continuation = &stub->continuation();
 132     }
 133     bs->nmethod_entry_barrier(this, slow_path, continuation);
 134   }
 135 }
 136 
 137 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 138   switch (vlen_in_bytes) {
 139     case  4: // fall-through
 140     case  8: // fall-through
 141     case 16: return Assembler::AVX_128bit;
 142     case 32: return Assembler::AVX_256bit;
 143     case 64: return Assembler::AVX_512bit;
 144 
 145     default: {
 146       ShouldNotReachHere();
 147       return Assembler::AVX_NoVec;
 148     }
 149   }
 150 }
 151 
 152 // fast_lock and fast_unlock used by C2
 153 
 154 // Because the transitions from emitted code to the runtime
 155 // monitorenter/exit helper stubs are so slow it's critical that
 156 // we inline both the lock-stack fast path and the inflated fast path.
 157 //
 158 // See also: cmpFastLock and cmpFastUnlock.
 159 //
 160 // What follows is a specialized inline transliteration of the code
 161 // in enter() and exit(). If we're concerned about I$ bloat another
 162 // option would be to emit TrySlowEnter and TrySlowExit methods
 163 // at startup-time.  These methods would accept arguments as
 // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 165 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 166 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 167 // In practice, however, the # of lock sites is bounded and is usually small.
 168 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
 // if the processor uses simple bimodal branch predictors keyed by EIP,
 // since the helper routines would be called from multiple synchronization
 171 // sites.
 172 //
 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 // to those specialized methods.  That'd give us a mostly platform-independent
 // implementation that the JITs could optimize and inline at their pleasure.
 // Done correctly, the only time we'd need to cross to native code would be
 178 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 179 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 180 // (b) explicit barriers or fence operations.
 181 //
 182 // TODO:
 183 //
 184 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 185 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 186 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 187 //    the lock operators would typically be faster than reifying Self.
 188 //
 189 // *  Ideally I'd define the primitives as:
 190 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 191 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 192 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 193 //    Instead, we're stuck with a rather awkward and brittle register assignments below.
 194 //    Furthermore the register assignments are overconstrained, possibly resulting in
 195 //    sub-optimal code near the synchronization site.
 196 //
 197 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 198 //    Alternately, use a better sp-proximity test.
 199 //
 200 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 201 //    Either one is sufficient to uniquely identify a thread.
 202 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 203 //
 204 // *  Intrinsify notify() and notifyAll() for the common cases where the
 205 //    object is locked by the calling thread but the waitlist is empty.
 206 //    avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
 207 //
 208 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 209 //    But beware of excessive branch density on AMD Opterons.
 210 //
 211 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 212 //    or failure of the fast path.  If the fast path fails then we pass
 213 //    control to the slow path, typically in C.  In fast_lock and
 214 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 215 //    will emit a conditional branch immediately after the node.
 216 //    So we have branches to branches and lots of ICC.ZF games.
 217 //    Instead, it might be better to have C2 pass a "FailureLabel"
 218 //    into fast_lock and fast_unlock.  In the case of success, control
 219 //    will drop through the node.  ICC.ZF is undefined at exit.
 220 //    In the case of failure, the node will branch directly to the
 221 //    FailureLabel
 222 
 223 // obj: object to lock
 224 // box: on-stack box address -- KILLED
 225 // rax: tmp -- KILLED
 226 // t  : tmp -- KILLED
 227 void C2_MacroAssembler::fast_lock(Register obj, Register box, Register rax_reg,
 228                                   Register t, Register thread) {
 229   assert(rax_reg == rax, "Used for CAS");
 230   assert_different_registers(obj, box, rax_reg, t, thread);
 231 
 232   // Handle inflated monitor.
 233   Label inflated;
 234   // Finish fast lock successfully. ZF value is irrelevant.
 235   Label locked;
 236   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 237   Label slow_path;
 238 
 239   if (UseObjectMonitorTable) {
 240     // Clear cache in case fast locking succeeds or we need to take the slow-path.
 241     movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
 242   }
 243 
 244   if (DiagnoseSyncOnValueBasedClasses != 0) {
 245     load_klass(rax_reg, obj, t);
 246     testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 247     jcc(Assembler::notZero, slow_path);
 248   }
 249 
 250   const Register mark = t;
 251 
 252   { // Fast Lock
 253 
 254     Label push;
 255 
 256     const Register top = UseObjectMonitorTable ? rax_reg : box;
 257 
 258     // Load the mark.
 259     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 260 
 261     // Prefetch top.
 262     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 263 
 264     // Check for monitor (0b10).
 265     testptr(mark, markWord::monitor_value);
 266     jcc(Assembler::notZero, inflated);
 267 
 268     // Check if lock-stack is full.
 269     cmpl(top, LockStack::end_offset() - 1);
 270     jcc(Assembler::greater, slow_path);
 271 
 272     // Check if recursive.
 273     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 274     jccb(Assembler::equal, push);
 275 
 276     // Try to lock. Transition lock bits 0b01 => 0b00
 277     movptr(rax_reg, mark);
 278     orptr(rax_reg, markWord::unlocked_value);
 279     andptr(mark, ~(int32_t)markWord::unlocked_value);
 280     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 281     jcc(Assembler::notEqual, slow_path);
 282 
 283     if (UseObjectMonitorTable) {
 284       // Need to reload top, clobbered by CAS.
 285       movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 286     }
 287     bind(push);
 288     // After successful lock, push object on lock-stack.
 289     movptr(Address(thread, top), obj);
 290     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 291     jmp(locked);
 292   }
 293 
 294   { // Handle inflated monitor.
 295     bind(inflated);
 296 
 297     const Register monitor = t;
 298 
 299     if (!UseObjectMonitorTable) {
 300       assert(mark == monitor, "should be the same here");
 301     } else {
 302       const Register hash = t;
 303       Label monitor_found;
 304 
 305       // Look for the monitor in the om_cache.
 306 
 307       ByteSize cache_offset   = JavaThread::om_cache_oops_offset();
 308       ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
 309       const int num_unrolled  = OMCache::CAPACITY;
 310       for (int i = 0; i < num_unrolled; i++) {
 311         movptr(monitor, Address(thread,  cache_offset + monitor_offset));
 312         cmpptr(obj, Address(thread, cache_offset));
 313         jccb(Assembler::equal, monitor_found);
 314         cache_offset = cache_offset + OMCache::oop_to_oop_difference();
 315       }
 316 
 317       if (UseCompactObjectHeaders) {
 318         // TODO: The fast-path table lookup currently doesn't work with Lilliput's
 319         // compact identity-hashcode implementation.
 320         // See: https://bugs.openjdk.org/browse/JDK-8380981
 321         jmp(slow_path);
 322       } else {
 323         // Look for the monitor in the table.
 324 
 325         // Get the hash code.
 326         movptr(hash, Address(obj, oopDesc::mark_offset_in_bytes()));
 327         shrq(hash, markWord::hash_shift);
 328         andq(hash, markWord::hash_mask);
 329 
 330         // Get the table and calculate the bucket's address.
 331         lea(rax_reg, ExternalAddress(ObjectMonitorTable::current_table_address()));
 332         movptr(rax_reg, Address(rax_reg));
 333         andq(hash, Address(rax_reg, ObjectMonitorTable::table_capacity_mask_offset()));
 334         movptr(rax_reg, Address(rax_reg, ObjectMonitorTable::table_buckets_offset()));
 335 
 336         // Read the monitor from the bucket.
 337         movptr(monitor, Address(rax_reg, hash, Address::times_ptr));
 338 
 339         // Check if the monitor in the bucket is special (empty, tombstone or removed)
 340         cmpptr(monitor, ObjectMonitorTable::SpecialPointerValues::below_is_special);
 341         jcc(Assembler::below, slow_path);
 342 
 343         // Check if object matches.
 344         movptr(rax_reg, Address(monitor, ObjectMonitor::object_offset()));
 345         BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 346         bs_asm->try_peek_weak_handle_in_nmethod(this, rax_reg, rax_reg, slow_path);
 347         cmpptr(rax_reg, obj);
 348         jcc(Assembler::notEqual, slow_path);
 349       }
 350       bind(monitor_found);
 351     }
 352     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 353     const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 354     const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);
 355 
 356     Label monitor_locked;
 357     // Lock the monitor.
 358 
 359     if (UseObjectMonitorTable) {
 360       // Cache the monitor for unlock before trashing box. On failure to acquire
 361       // the lock, the slow path will reset the entry accordingly (see CacheSetter).
 362       movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
 363     }
 364 
 365     // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 366     xorptr(rax_reg, rax_reg);
 367     movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
 368     lock(); cmpxchgptr(box, owner_address);
 369     jccb(Assembler::equal, monitor_locked);
 370 
 371     // Check if recursive.
 372     cmpptr(box, rax_reg);
 373     jccb(Assembler::notEqual, slow_path);
 374 
 375     // Recursive.
 376     increment(recursions_address);
 377 
 378     bind(monitor_locked);
 379   }
 380 
 381   bind(locked);
 382   // Set ZF = 1
 383   xorl(rax_reg, rax_reg);
 384 
 385 #ifdef ASSERT
 386   // Check that locked label is reached with ZF set.
 387   Label zf_correct;
 388   Label zf_bad_zero;
 389   jcc(Assembler::zero, zf_correct);
 390   jmp(zf_bad_zero);
 391 #endif
 392 
 393   bind(slow_path);
 394 #ifdef ASSERT
 395   // Check that slow_path label is reached with ZF not set.
 396   jcc(Assembler::notZero, zf_correct);
 397   stop("Fast Lock ZF != 0");
 398   bind(zf_bad_zero);
 399   stop("Fast Lock ZF != 1");
 400   bind(zf_correct);
 401 #endif
 402   // C2 uses the value of ZF to determine the continuation.
 403 }
 404 
 // obj: object to unlock
 406 // rax: tmp -- KILLED
 407 // t  : tmp - cannot be obj nor rax -- KILLED
 408 //
 409 // Some commentary on balanced locking:
 410 //
 411 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 412 // Methods that don't have provably balanced locking are forced to run in the
 413 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 414 // The interpreter provides two properties:
 415 // I1:  At return-time the interpreter automatically and quietly unlocks any
 416 //      objects acquired in the current activation (frame).  Recall that the
 417 //      interpreter maintains an on-stack list of locks currently held by
 418 //      a frame.
 419 // I2:  If a method attempts to unlock an object that is not held by the
 420 //      frame the interpreter throws IMSX.
 421 //
 422 // Lets say A(), which has provably balanced locking, acquires O and then calls B().
 423 // B() doesn't have provably balanced locking so it runs in the interpreter.
 424 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 425 // is still locked by A().
 426 //
 427 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface
 428 // Specification" states that an object locked by JNI's MonitorEnter should not be
 429 // unlocked by "normal" java-level locking and vice-versa.  The specification doesn't
 430 // specify what will occur if a program engages in such mixed-mode locking, however.
 431 // Arguably given that the spec legislates the JNI case as undefined our implementation
 432 // could reasonably *avoid* checking owner in fast_unlock().
 433 // In the interest of performance we elide m->Owner==Self check in unlock.
 434 // A perfectly viable alternative is to elide the owner check except when
 435 // Xcheck:jni is enabled.
 436 
 437 void C2_MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register t, Register thread) {
 438   assert(reg_rax == rax, "Used for CAS");
 439   assert_different_registers(obj, reg_rax, t);
 440 
 441   // Handle inflated monitor.
 442   Label inflated, inflated_check_lock_stack;
 443   // Finish fast unlock successfully.  MUST jump with ZF == 1
 444   Label unlocked, slow_path;
 445 
 446   const Register mark = t;
 447   const Register monitor = t;
 448   const Register top = UseObjectMonitorTable ? t : reg_rax;
 449   const Register box = reg_rax;
 450 
 451   Label dummy;
 452   C2FastUnlockStub* stub = nullptr;
 453 
 454   if (!Compile::current()->output()->in_scratch_emit_size()) {
 455     stub = new (Compile::current()->comp_arena()) C2FastUnlockStub(obj, mark, reg_rax, thread);
 456     Compile::current()->output()->add_stub(stub);
 457   }
 458 
 459   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
 460 
 461   { // Fast Unlock
 462 
 463     // Load top.
 464     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 465 
 466     if (!UseObjectMonitorTable) {
 467       // Prefetch mark.
 468       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 469     }
 470 
 471     // Check if obj is top of lock-stack.
 472     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 473     // Top of lock stack was not obj. Must be monitor.
 474     jcc(Assembler::notEqual, inflated_check_lock_stack);
 475 
 476     // Pop lock-stack.
 477     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
 478     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 479 
 480     // Check if recursive.
 481     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
 482     jcc(Assembler::equal, unlocked);
 483 
 484     // We elide the monitor check, let the CAS fail instead.
 485 
 486     if (UseObjectMonitorTable) {
 487       // Load mark.
 488       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 489     }
 490 
 491     // Try to unlock. Transition lock bits 0b00 => 0b01
 492     movptr(reg_rax, mark);
 493     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
 494     orptr(mark, markWord::unlocked_value);
 495     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 496     jcc(Assembler::notEqual, push_and_slow_path);
 497     jmp(unlocked);
 498   }
 499 
 500 
 501   { // Handle inflated monitor.
 502     bind(inflated_check_lock_stack);
 503 #ifdef ASSERT
 504     Label check_done;
 505     subl(top, oopSize);
 506     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
 507     jcc(Assembler::below, check_done);
 508     cmpptr(obj, Address(thread, top));
 509     jcc(Assembler::notEqual, inflated_check_lock_stack);
 510     stop("Fast Unlock lock on stack");
 511     bind(check_done);
 512     if (UseObjectMonitorTable) {
 513       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 514     }
 515     testptr(mark, markWord::monitor_value);
 516     jcc(Assembler::notZero, inflated);
 517     stop("Fast Unlock not monitor");
 518 #endif
 519 
 520     bind(inflated);
 521 
 522     if (!UseObjectMonitorTable) {
 523       assert(mark == monitor, "should be the same here");
 524     } else {
 525       // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
 526       movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 527       // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
 528       cmpptr(monitor, alignof(ObjectMonitor*));
 529       jcc(Assembler::below, slow_path);
 530     }
 531     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 532     const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
 533     const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
 534     const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
 535     const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
 536 
 537     Label recursive;
 538 
 539     // Check if recursive.
 540     cmpptr(recursions_address, 0);
 541     jcc(Assembler::notZero, recursive);
 542 
 543     // Set owner to null.
 544     // Release to satisfy the JMM
 545     movptr(owner_address, NULL_WORD);
 546     // We need a full fence after clearing owner to avoid stranding.
 547     // StoreLoad achieves this.
 548     membar(StoreLoad);
 549 
 550     // Check if the entry_list is empty.
 551     cmpptr(entry_list_address, NULL_WORD);
 552     jcc(Assembler::zero, unlocked);    // If so we are done.
 553 
 554     // Check if there is a successor.
 555     cmpptr(succ_address, NULL_WORD);
 556     jcc(Assembler::notZero, unlocked); // If so we are done.
 557 
 558     // Save the monitor pointer in the current thread, so we can try to
 559     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 560     if (!UseObjectMonitorTable) {
 561       andptr(monitor, ~(int32_t)markWord::monitor_value);
 562     }
 563     movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);
 564 
 565     orl(t, 1); // Fast Unlock ZF = 0
 566     jmpb(slow_path);
 567 
 568     // Recursive unlock.
 569     bind(recursive);
 570     decrement(recursions_address);
 571   }
 572 
 573   bind(unlocked);
 574   xorl(t, t); // Fast Unlock ZF = 1
 575 
 576 #ifdef ASSERT
 577   // Check that unlocked label is reached with ZF set.
 578   Label zf_correct;
 579   Label zf_bad_zero;
 580   jcc(Assembler::zero, zf_correct);
 581   jmp(zf_bad_zero);
 582 #endif
 583 
 584   bind(slow_path);
 585   if (stub != nullptr) {
 586     bind(stub->slow_path_continuation());
 587   }
 588 #ifdef ASSERT
 589   // Check that stub->continuation() label is reached with ZF not set.
 590   jcc(Assembler::notZero, zf_correct);
 591   stop("Fast Unlock ZF != 0");
 592   bind(zf_bad_zero);
 593   stop("Fast Unlock ZF != 1");
 594   bind(zf_correct);
 595 #endif
 596   // C2 uses the value of ZF to determine the continuation.
 597 }
 598 
 599 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
 600   fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
 601 }
 602 
 603 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
 604   const int framesize = Compile::current()->output()->frame_size_in_bytes();
 605   masm->movptr(dst, rsp);
 606   if (framesize > 2 * wordSize) {
 607     masm->addptr(dst, framesize - 2 * wordSize);
 608   }
 609 }
 610 
 611 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
 612   if (PreserveFramePointer) {
 613     // frame pointer is valid
 614 #ifdef ASSERT
 615     // Verify frame pointer value in rbp.
 616     reconstruct_frame_pointer_helper(this, rtmp);
 617     Label L_success;
 618     cmpq(rbp, rtmp);
 619     jccb(Assembler::equal, L_success);
 620     STOP("frame pointer mismatch");
 621     bind(L_success);
 622 #endif // ASSERT
 623   } else {
 624     reconstruct_frame_pointer_helper(this, rbp);
 625   }
 626 }
 627 
 628 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
 629   jint lo = t->_lo;
 630   jint hi = t->_hi;
 631   assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
 632   if (t == TypeInt::INT) {
 633     return;
 634   }
 635 
 636   BLOCK_COMMENT("CastII {");
 637   Label fail;
 638   Label succeed;
 639 
 640   if (lo != min_jint) {
 641     cmpl(val, lo);
 642     jccb(Assembler::less, fail);
 643   }
 644   if (hi != max_jint) {
 645     cmpl(val, hi);
 646     jccb(Assembler::greater, fail);
 647   }
 648   jmpb(succeed);
 649 
 650   bind(fail);
 651   movl(c_rarg0, idx);
 652   movl(c_rarg1, val);
 653   movl(c_rarg2, lo);
 654   movl(c_rarg3, hi);
 655   reconstruct_frame_pointer(rscratch1);
 656   call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
 657   hlt();
 658   bind(succeed);
 659   BLOCK_COMMENT("} // CastII");
 660 }
 661 
 662 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
 663   fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
 664 }
 665 
 666 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
 667   jlong lo = t->_lo;
 668   jlong hi = t->_hi;
 669   assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
 670   if (t == TypeLong::LONG) {
 671     return;
 672   }
 673 
 674   BLOCK_COMMENT("CastLL {");
 675   Label fail;
 676   Label succeed;
 677 
 678   auto cmp_val = [&](jlong bound) {
 679     if (is_simm32(bound)) {
 680       cmpq(val, checked_cast<int>(bound));
 681     } else {
 682       mov64(tmp, bound);
 683       cmpq(val, tmp);
 684     }
 685   };
 686 
 687   if (lo != min_jlong) {
 688     cmp_val(lo);
 689     jccb(Assembler::less, fail);
 690   }
 691   if (hi != max_jlong) {
 692     cmp_val(hi);
 693     jccb(Assembler::greater, fail);
 694   }
 695   jmpb(succeed);
 696 
 697   bind(fail);
 698   movl(c_rarg0, idx);
 699   movq(c_rarg1, val);
 700   mov64(c_rarg2, lo);
 701   mov64(c_rarg3, hi);
 702   reconstruct_frame_pointer(rscratch1);
 703   call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
 704   hlt();
 705   bind(succeed);
 706   BLOCK_COMMENT("} // CastLL");
 707 }
 708 
 709 //-------------------------------------------------------------------------------------------
 710 // Generic instructions support for use in .ad files C2 code generation
 711 
 712 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 713   if (dst != src) {
 714     movdqu(dst, src);
 715   }
 716   if (opcode == Op_AbsVD) {
 717     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 718   } else {
 719     assert((opcode == Op_NegVD),"opcode should be Op_NegD");
 720     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 721   }
 722 }
 723 
 724 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 725   if (opcode == Op_AbsVD) {
 726     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 727   } else {
 728     assert((opcode == Op_NegVD),"opcode should be Op_NegD");
 729     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 730   }
 731 }
 732 
 733 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 734   if (dst != src) {
 735     movdqu(dst, src);
 736   }
 737   if (opcode == Op_AbsVF) {
 738     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 739   } else {
 740     assert((opcode == Op_NegVF),"opcode should be Op_NegF");
 741     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 742   }
 743 }
 744 
 745 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 746   if (opcode == Op_AbsVF) {
 747     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 748   } else {
 749     assert((opcode == Op_NegVF),"opcode should be Op_NegF");
 750     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 751   }
 752 }
 753 
 754 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 755   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 756   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 757 
 758   if (opcode == Op_MinV) {
 759     if (elem_bt == T_BYTE) {
 760       pminsb(dst, src);
 761     } else if (elem_bt == T_SHORT) {
 762       pminsw(dst, src);
 763     } else if (elem_bt == T_INT) {
 764       pminsd(dst, src);
 765     } else {
 766       assert(elem_bt == T_LONG, "required");
 767       assert(tmp == xmm0, "required");
 768       assert_different_registers(dst, src, tmp);
 769       movdqu(xmm0, dst);
 770       pcmpgtq(xmm0, src);
 771       blendvpd(dst, src);  // xmm0 as mask
 772     }
 773   } else { // opcode == Op_MaxV
 774     if (elem_bt == T_BYTE) {
 775       pmaxsb(dst, src);
 776     } else if (elem_bt == T_SHORT) {
 777       pmaxsw(dst, src);
 778     } else if (elem_bt == T_INT) {
 779       pmaxsd(dst, src);
 780     } else {
 781       assert(elem_bt == T_LONG, "required");
 782       assert(tmp == xmm0, "required");
 783       assert_different_registers(dst, src, tmp);
 784       movdqu(xmm0, src);
 785       pcmpgtq(xmm0, dst);
 786       blendvpd(dst, src);  // xmm0 as mask
 787     }
 788   }
 789 }
 790 
 791 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 792                                   XMMRegister src1, Address src2, int vlen_enc) {
 793   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 794   if (opcode == Op_UMinV) {
 795     switch(elem_bt) {
 796       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 797       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 798       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 799       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 800       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 801     }
 802   } else {
 803     assert(opcode == Op_UMaxV, "required");
 804     switch(elem_bt) {
 805       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 806       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 807       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 808       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 809       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 810     }
 811   }
 812 }
 813 
 814 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
 815   // For optimality, leverage a full vector width of 512 bits
 816   // for operations over smaller vector sizes on AVX512 targets.
 817   if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 818     if (opcode == Op_UMaxV) {
 819       evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 820     } else {
 821       assert(opcode == Op_UMinV, "required");
 822       evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 823     }
 824   } else {
 825     // T1 = -1
 826     vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
 827     // T1 = -1 << 63
 828     vpsllq(xtmp1, xtmp1, 63, vlen_enc);
 829     // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
 830     vpaddq(xtmp2, xtmp1, src2, vlen_enc);
 831     // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
 832     vpaddq(xtmp1, xtmp1, src1, vlen_enc);
 833     // Mask = T2 > T1
 834     vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
 835     if (opcode == Op_UMaxV) {
 836       // Res = Mask ? Src2 : Src1
 837       vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
 838     } else {
 839       // Res = Mask ? Src1 : Src2
 840       vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
 841     }
 842   }
 843 }
 844 
 845 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 846                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
 847   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 848   if (opcode == Op_UMinV) {
 849     switch(elem_bt) {
 850       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 851       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 852       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 853       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 854       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 855     }
 856   } else {
 857     assert(opcode == Op_UMaxV, "required");
 858     switch(elem_bt) {
 859       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 860       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 861       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 862       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 863       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 864     }
 865   }
 866 }
 867 
 868 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 869                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 870                                  int vlen_enc) {
 871   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 872 
 873   if (opcode == Op_MinV) {
 874     if (elem_bt == T_BYTE) {
 875       vpminsb(dst, src1, src2, vlen_enc);
 876     } else if (elem_bt == T_SHORT) {
 877       vpminsw(dst, src1, src2, vlen_enc);
 878     } else if (elem_bt == T_INT) {
 879       vpminsd(dst, src1, src2, vlen_enc);
 880     } else {
 881       assert(elem_bt == T_LONG, "required");
 882       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 883         vpminsq(dst, src1, src2, vlen_enc);
 884       } else {
 885         assert_different_registers(dst, src1, src2);
 886         vpcmpgtq(dst, src1, src2, vlen_enc);
 887         vblendvpd(dst, src1, src2, dst, vlen_enc);
 888       }
 889     }
 890   } else { // opcode == Op_MaxV
 891     if (elem_bt == T_BYTE) {
 892       vpmaxsb(dst, src1, src2, vlen_enc);
 893     } else if (elem_bt == T_SHORT) {
 894       vpmaxsw(dst, src1, src2, vlen_enc);
 895     } else if (elem_bt == T_INT) {
 896       vpmaxsd(dst, src1, src2, vlen_enc);
 897     } else {
 898       assert(elem_bt == T_LONG, "required");
 899       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 900         vpmaxsq(dst, src1, src2, vlen_enc);
 901       } else {
 902         assert_different_registers(dst, src1, src2);
 903         vpcmpgtq(dst, src1, src2, vlen_enc);
 904         vblendvpd(dst, src2, src1, dst, vlen_enc);
 905       }
 906     }
 907   }
 908 }
 909 
 910 // Float/Double min max
 911 
 912 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
 913                                    XMMRegister dst, XMMRegister a, XMMRegister b,
 914                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
 915                                    int vlen_enc) {
 916   assert(UseAVX > 0, "required");
 917   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
 918          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
 919   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
 920   assert_different_registers(a, tmp, atmp, btmp);
 921   assert_different_registers(b, tmp, atmp, btmp);
 922 
 923   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
 924   bool is_double_word = is_double_word_type(elem_bt);
 925 
 926   /* Note on 'non-obvious' assembly sequence:
 927    *
 928    * While there are vminps/vmaxps instructions, there are two important differences between hardware
 929    * and Java on how they handle floats:
 930    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
 931    *  b. NaN is not necesarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
 932    *
 933    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
 934    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
 935    *                (only useful when signs differ, noop otherwise)
 936    *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
 937 
 938    *  Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
 939    *   btmp = (b < +0.0) ? a : b
 940    *   atmp = (b < +0.0) ? b : a
 941    *   Tmp  = Max_Float(atmp , btmp)
 942    *   Res  = (atmp == NaN) ? atmp : Tmp
 943    */
 944 
 945   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
 946   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
 947   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
 948   XMMRegister mask;
 949 
 950   if (!is_double_word && is_min) {
 951     mask = a;
 952     vblend = &MacroAssembler::vblendvps;
 953     vmaxmin = &MacroAssembler::vminps;
 954     vcmp = &MacroAssembler::vcmpps;
 955   } else if (!is_double_word && !is_min) {
 956     mask = b;
 957     vblend = &MacroAssembler::vblendvps;
 958     vmaxmin = &MacroAssembler::vmaxps;
 959     vcmp = &MacroAssembler::vcmpps;
 960   } else if (is_double_word && is_min) {
 961     mask = a;
 962     vblend = &MacroAssembler::vblendvpd;
 963     vmaxmin = &MacroAssembler::vminpd;
 964     vcmp = &MacroAssembler::vcmppd;
 965   } else {
 966     assert(is_double_word && !is_min, "sanity");
 967     mask = b;
 968     vblend = &MacroAssembler::vblendvpd;
 969     vmaxmin = &MacroAssembler::vmaxpd;
 970     vcmp = &MacroAssembler::vcmppd;
 971   }
 972 
 973   // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
 974   XMMRegister maxmin, scratch;
 975   if (dst == btmp) {
 976     maxmin = btmp;
 977     scratch = tmp;
 978   } else {
 979     maxmin = tmp;
 980     scratch = btmp;
 981   }
 982 
 983   bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
 984   if (precompute_mask && !is_double_word) {
 985     vpsrad(tmp, mask, 32, vlen_enc);
 986     mask = tmp;
 987   } else if (precompute_mask && is_double_word) {
 988     vpxor(tmp, tmp, tmp, vlen_enc);
 989     vpcmpgtq(tmp, tmp, mask, vlen_enc);
 990     mask = tmp;
 991   }
 992 
 993   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
 994   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
 995   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
 996   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 997   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
 998 }
 999 
1000 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1001                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1002                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1003                                     int vlen_enc) {
1004   assert(UseAVX > 2, "required");
1005   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1006          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1007   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1008   assert_different_registers(dst, a, atmp, btmp);
1009   assert_different_registers(dst, b, atmp, btmp);
1010 
1011   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1012   bool is_double_word = is_double_word_type(elem_bt);
1013   bool merge = true;
1014 
1015   if (!is_double_word && is_min) {
1016     evpmovd2m(ktmp, a, vlen_enc);
1017     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1018     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1019     vminps(dst, atmp, btmp, vlen_enc);
1020     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1021     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1022   } else if (!is_double_word && !is_min) {
1023     evpmovd2m(ktmp, b, vlen_enc);
1024     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1025     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1026     vmaxps(dst, atmp, btmp, vlen_enc);
1027     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1028     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1029   } else if (is_double_word && is_min) {
1030     evpmovq2m(ktmp, a, vlen_enc);
1031     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1032     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1033     vminpd(dst, atmp, btmp, vlen_enc);
1034     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1035     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1036   } else {
1037     assert(is_double_word && !is_min, "sanity");
1038     evpmovq2m(ktmp, b, vlen_enc);
1039     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1040     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1041     vmaxpd(dst, atmp, btmp, vlen_enc);
1042     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1043     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1044   }
1045 }
1046 
1047 void C2_MacroAssembler::vminmax_fp_avx10_2(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1048                                            XMMRegister src1, XMMRegister src2, int vlen_enc) {
1049   assert(opc == Op_MinV || opc == Op_MinReductionV ||
1050          opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");
1051 
1052   int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
1053                                                          : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
1054   if (elem_bt == T_FLOAT) {
1055     evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
1056   } else {
1057     assert(elem_bt == T_DOUBLE, "");
1058     evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
1059   }
1060 }
1061 
1062 void C2_MacroAssembler::sminmax_fp_avx10_2(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1063                                            XMMRegister src1, XMMRegister src2) {
1064   assert(opc == Op_MinF || opc == Op_MaxF ||
1065          opc == Op_MinD || opc == Op_MaxD, "sanity");
1066 
1067   int imm8 = (opc == Op_MinF || opc == Op_MinD) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
1068                                                 : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
1069   if (elem_bt == T_FLOAT) {
1070     evminmaxss(dst, mask, src1, src2, true, imm8);
1071   } else {
1072     assert(elem_bt == T_DOUBLE, "");
1073     evminmaxsd(dst, mask, src1, src2, true, imm8);
1074   }
1075 }
1076 
1077 // Float/Double signum
1078 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1079   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1080 
1081   Label DONE_LABEL;
1082 
1083   // Handle special cases +0.0/-0.0 and NaN, if argument is +0.0/-0.0 or NaN, return argument
1084   // If AVX10.2 (or newer) floating point comparison instructions used, SF=1 for equal and unordered cases
1085   // If other floating point comparison instructions used, ZF=1 for equal and unordered cases
1086   if (opcode == Op_SignumF) {
1087     if (VM_Version::supports_avx10_2()) {
1088       evucomxss(dst, zero);
1089       jcc(Assembler::negative, DONE_LABEL);
1090     } else {
1091       ucomiss(dst, zero);
1092       jcc(Assembler::equal, DONE_LABEL);
1093     }
1094     movflt(dst, one);
1095     jcc(Assembler::above, DONE_LABEL);
1096     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1097   } else if (opcode == Op_SignumD) {
1098     if (VM_Version::supports_avx10_2()) {
1099       evucomxsd(dst, zero);
1100       jcc(Assembler::negative, DONE_LABEL);
1101     } else {
1102       ucomisd(dst, zero);
1103       jcc(Assembler::equal, DONE_LABEL);
1104     }
1105     movdbl(dst, one);
1106     jcc(Assembler::above, DONE_LABEL);
1107     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1108   }
1109 
1110   bind(DONE_LABEL);
1111 }
1112 
1113 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1114   if (sign) {
1115     pmovsxbw(dst, src);
1116   } else {
1117     pmovzxbw(dst, src);
1118   }
1119 }
1120 
1121 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1122   if (sign) {
1123     vpmovsxbw(dst, src, vector_len);
1124   } else {
1125     vpmovzxbw(dst, src, vector_len);
1126   }
1127 }
1128 
1129 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1130   if (sign) {
1131     vpmovsxbd(dst, src, vector_len);
1132   } else {
1133     vpmovzxbd(dst, src, vector_len);
1134   }
1135 }
1136 
1137 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1138   if (sign) {
1139     vpmovsxwd(dst, src, vector_len);
1140   } else {
1141     vpmovzxwd(dst, src, vector_len);
1142   }
1143 }
1144 
1145 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1146                                      int shift, int vector_len) {
1147   if (opcode == Op_RotateLeftV) {
1148     if (etype == T_INT) {
1149       evprold(dst, src, shift, vector_len);
1150     } else {
1151       assert(etype == T_LONG, "expected type T_LONG");
1152       evprolq(dst, src, shift, vector_len);
1153     }
1154   } else {
1155     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1156     if (etype == T_INT) {
1157       evprord(dst, src, shift, vector_len);
1158     } else {
1159       assert(etype == T_LONG, "expected type T_LONG");
1160       evprorq(dst, src, shift, vector_len);
1161     }
1162   }
1163 }
1164 
1165 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1166                                      XMMRegister shift, int vector_len) {
1167   if (opcode == Op_RotateLeftV) {
1168     if (etype == T_INT) {
1169       evprolvd(dst, src, shift, vector_len);
1170     } else {
1171       assert(etype == T_LONG, "expected type T_LONG");
1172       evprolvq(dst, src, shift, vector_len);
1173     }
1174   } else {
1175     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1176     if (etype == T_INT) {
1177       evprorvd(dst, src, shift, vector_len);
1178     } else {
1179       assert(etype == T_LONG, "expected type T_LONG");
1180       evprorvq(dst, src, shift, vector_len);
1181     }
1182   }
1183 }
1184 
1185 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1186   if (opcode == Op_RShiftVI) {
1187     psrad(dst, shift);
1188   } else if (opcode == Op_LShiftVI) {
1189     pslld(dst, shift);
1190   } else {
1191     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1192     psrld(dst, shift);
1193   }
1194 }
1195 
1196 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1197   switch (opcode) {
1198     case Op_RShiftVI:  psrad(dst, shift); break;
1199     case Op_LShiftVI:  pslld(dst, shift); break;
1200     case Op_URShiftVI: psrld(dst, shift); break;
1201 
1202     default: assert(false, "%s", NodeClassNames[opcode]);
1203   }
1204 }
1205 
1206 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1207   if (opcode == Op_RShiftVI) {
1208     vpsrad(dst, nds, shift, vector_len);
1209   } else if (opcode == Op_LShiftVI) {
1210     vpslld(dst, nds, shift, vector_len);
1211   } else {
1212     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1213     vpsrld(dst, nds, shift, vector_len);
1214   }
1215 }
1216 
1217 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1218   switch (opcode) {
1219     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1220     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1221     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1222 
1223     default: assert(false, "%s", NodeClassNames[opcode]);
1224   }
1225 }
1226 
1227 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1228   switch (opcode) {
1229     case Op_RShiftVB:  // fall-through
1230     case Op_RShiftVS:  psraw(dst, shift); break;
1231 
1232     case Op_LShiftVB:  // fall-through
1233     case Op_LShiftVS:  psllw(dst, shift);   break;
1234 
1235     case Op_URShiftVS: // fall-through
1236     case Op_URShiftVB: psrlw(dst, shift);  break;
1237 
1238     default: assert(false, "%s", NodeClassNames[opcode]);
1239   }
1240 }
1241 
1242 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1243   switch (opcode) {
1244     case Op_RShiftVB:  // fall-through
1245     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1246 
1247     case Op_LShiftVB:  // fall-through
1248     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1249 
1250     case Op_URShiftVS: // fall-through
1251     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1252 
1253     default: assert(false, "%s", NodeClassNames[opcode]);
1254   }
1255 }
1256 
1257 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1258   switch (opcode) {
1259     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems
1260     case Op_LShiftVL:  psllq(dst, shift); break;
1261     case Op_URShiftVL: psrlq(dst, shift); break;
1262 
1263     default: assert(false, "%s", NodeClassNames[opcode]);
1264   }
1265 }
1266 
1267 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1268   if (opcode == Op_RShiftVL) {
1269     psrlq(dst, shift);  // using srl to implement sra on pre-avs512 systems
1270   } else if (opcode == Op_LShiftVL) {
1271     psllq(dst, shift);
1272   } else {
1273     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1274     psrlq(dst, shift);
1275   }
1276 }
1277 
1278 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1279   switch (opcode) {
1280     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1281     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1282     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1283 
1284     default: assert(false, "%s", NodeClassNames[opcode]);
1285   }
1286 }
1287 
1288 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1289   if (opcode == Op_RShiftVL) {
1290     evpsraq(dst, nds, shift, vector_len);
1291   } else if (opcode == Op_LShiftVL) {
1292     vpsllq(dst, nds, shift, vector_len);
1293   } else {
1294     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1295     vpsrlq(dst, nds, shift, vector_len);
1296   }
1297 }
1298 
1299 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1300   switch (opcode) {
1301     case Op_RShiftVB:  // fall-through
1302     case Op_RShiftVS:  // fall-through
1303     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1304 
1305     case Op_LShiftVB:  // fall-through
1306     case Op_LShiftVS:  // fall-through
1307     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1308 
1309     case Op_URShiftVB: // fall-through
1310     case Op_URShiftVS: // fall-through
1311     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1312 
1313     default: assert(false, "%s", NodeClassNames[opcode]);
1314   }
1315 }
1316 
1317 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1318   switch (opcode) {
1319     case Op_RShiftVB:  // fall-through
1320     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1321 
1322     case Op_LShiftVB:  // fall-through
1323     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1324 
1325     case Op_URShiftVB: // fall-through
1326     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1327 
1328     default: assert(false, "%s", NodeClassNames[opcode]);
1329   }
1330 }
1331 
1332 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1333   assert(UseAVX >= 2, "required");
1334   switch (opcode) {
1335     case Op_RShiftVL: {
1336       if (UseAVX > 2) {
1337         assert(tmp == xnoreg, "not used");
1338         if (!VM_Version::supports_avx512vl()) {
1339           vlen_enc = Assembler::AVX_512bit;
1340         }
1341         evpsravq(dst, src, shift, vlen_enc);
1342       } else {
1343         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1344         vpsrlvq(dst, src, shift, vlen_enc);
1345         vpsrlvq(tmp, tmp, shift, vlen_enc);
1346         vpxor(dst, dst, tmp, vlen_enc);
1347         vpsubq(dst, dst, tmp, vlen_enc);
1348       }
1349       break;
1350     }
1351     case Op_LShiftVL: {
1352       assert(tmp == xnoreg, "not used");
1353       vpsllvq(dst, src, shift, vlen_enc);
1354       break;
1355     }
1356     case Op_URShiftVL: {
1357       assert(tmp == xnoreg, "not used");
1358       vpsrlvq(dst, src, shift, vlen_enc);
1359       break;
1360     }
1361     default: assert(false, "%s", NodeClassNames[opcode]);
1362   }
1363 }
1364 
1365 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
1366 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1367   assert(opcode == Op_LShiftVB ||
1368          opcode == Op_RShiftVB ||
1369          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1370   bool sign = (opcode != Op_URShiftVB);
1371   assert(vector_len == 0, "required");
1372   vextendbd(sign, dst, src, 1);
1373   vpmovzxbd(vtmp, shift, 1);
1374   varshiftd(opcode, dst, dst, vtmp, 1);
1375   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1376   vextracti128_high(vtmp, dst);
1377   vpackusdw(dst, dst, vtmp, 0);
1378 }
1379 
1380 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
1381 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1382   assert(opcode == Op_LShiftVB ||
1383          opcode == Op_RShiftVB ||
1384          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1385   bool sign = (opcode != Op_URShiftVB);
1386   int ext_vector_len = vector_len + 1;
1387   vextendbw(sign, dst, src, ext_vector_len);
1388   vpmovzxbw(vtmp, shift, ext_vector_len);
1389   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1390   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1391   if (vector_len == 0) {
1392     vextracti128_high(vtmp, dst);
1393     vpackuswb(dst, dst, vtmp, vector_len);
1394   } else {
1395     vextracti64x4_high(vtmp, dst);
1396     vpackuswb(dst, dst, vtmp, vector_len);
1397     vpermq(dst, dst, 0xD8, vector_len);
1398   }
1399 }
1400 
1401 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1402   switch(typ) {
1403     case T_BYTE:
1404       pinsrb(dst, val, idx);
1405       break;
1406     case T_SHORT:
1407       pinsrw(dst, val, idx);
1408       break;
1409     case T_INT:
1410       pinsrd(dst, val, idx);
1411       break;
1412     case T_LONG:
1413       pinsrq(dst, val, idx);
1414       break;
1415     default:
1416       assert(false,"Should not reach here.");
1417       break;
1418   }
1419 }
1420 
1421 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1422   switch(typ) {
1423     case T_BYTE:
1424       vpinsrb(dst, src, val, idx);
1425       break;
1426     case T_SHORT:
1427       vpinsrw(dst, src, val, idx);
1428       break;
1429     case T_INT:
1430       vpinsrd(dst, src, val, idx);
1431       break;
1432     case T_LONG:
1433       vpinsrq(dst, src, val, idx);
1434       break;
1435     default:
1436       assert(false,"Should not reach here.");
1437       break;
1438   }
1439 }
1440 
1441 void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
1442                                          Register base, Register idx_base,
1443                                          Register mask, Register mask_idx,
1444                                          Register rtmp, int vlen_enc) {
1445   vpxor(dst, dst, dst, vlen_enc);
1446   if (elem_bt == T_SHORT) {
1447     for (int i = 0; i < 4; i++) {
1448       // dst[i] = mask[i] ? src[idx_base[i]] : 0
1449       Label skip_load;
1450       btq(mask, mask_idx);
1451       jccb(Assembler::carryClear, skip_load);
1452       movl(rtmp, Address(idx_base, i * 4));
1453       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1454       bind(skip_load);
1455       incq(mask_idx);
1456     }
1457   } else {
1458     assert(elem_bt == T_BYTE, "");
1459     for (int i = 0; i < 8; i++) {
1460       // dst[i] = mask[i] ? src[idx_base[i]] : 0
1461       Label skip_load;
1462       btq(mask, mask_idx);
1463       jccb(Assembler::carryClear, skip_load);
1464       movl(rtmp, Address(idx_base, i * 4));
1465       pinsrb(dst, Address(base, rtmp), i);
1466       bind(skip_load);
1467       incq(mask_idx);
1468     }
1469   }
1470 }
1471 
1472 void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
1473                                   Register base, Register idx_base,
1474                                   Register rtmp, int vlen_enc) {
1475   vpxor(dst, dst, dst, vlen_enc);
1476   if (elem_bt == T_SHORT) {
1477     for (int i = 0; i < 4; i++) {
1478       // dst[i] = src[idx_base[i]]
1479       movl(rtmp, Address(idx_base, i * 4));
1480       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1481     }
1482   } else {
1483     assert(elem_bt == T_BYTE, "");
1484     for (int i = 0; i < 8; i++) {
1485       // dst[i] = src[idx_base[i]]
1486       movl(rtmp, Address(idx_base, i * 4));
1487       pinsrb(dst, Address(base, rtmp), i);
1488     }
1489   }
1490 }
1491 
1492 /*
1493  * Gather using hybrid algorithm, first partially unroll scalar loop
1494  * to accumulate values from gather indices into a quad-word(64bit) slice.
1495  * A slice may hold 8 bytes or 4 short values. This is followed by a vector
1496  * permutation to place the slice into appropriate vector lane
1497  * locations in destination vector. Following pseudo code describes the
1498  * algorithm in detail:
1499  *
1500  * DST_VEC = ZERO_VEC
1501  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1502  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1503  * FOREACH_ITER:
1504  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1505  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1506  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1507  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1508  *
1509  * With each iteration, doubleword permute indices (0,1) corresponding
1510  * to gathered quadword gets right shifted by two lane positions.
1511  *
1512  */
// Emits a loop that builds dst from sub-word elements gathered 8 bytes per
// iteration, following the algorithm sketched in the comment above.
// Each pass gathers into temp_dst (masked variant when 'mask' is a real
// register), permutes it with the indices in xtmp1, ORs it into dst, then
// lowers the permute indices by 2, advances idx_base and decrements length.
// Writes dst, xtmp1, xtmp2, temp_dst, idx_base and length; rtmp and mask_idx
// are handed on to the vgather8b helpers.
1513 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1514                                         Register base, Register idx_base,
1515                                         Register mask, XMMRegister xtmp1,
1516                                         XMMRegister xtmp2, XMMRegister temp_dst,
1517                                         Register rtmp, Register mask_idx,
1518                                         Register length, int vector_len, int vlen_enc) {
1519   Label GATHER8_LOOP;
1520   assert(is_subword_type(elem_ty), "");
1521   movl(length, vector_len); // loop counter: elements still to gather
1522   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1523   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1524   vallones(xtmp2, vlen_enc);
1525   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); // xtmp2 = 0 - (-1) = {1, 1, ...}
1526   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1527   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1528
1529   bind(GATHER8_LOOP);
1530     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1531     if (mask == noreg) {
1532       vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
1533     } else {
1534       vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
1535     }
1536     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1537     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1538     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1539     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1540     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1541     vpor(dst, dst, temp_dst, vlen_enc);
1542     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1)); // +32 for 1-byte, +16 for 2-byte elements
1543     subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); // -8 for 1-byte, -4 for 2-byte elements
1544     jcc(Assembler::notEqual, GATHER8_LOOP); // repeat until length reaches zero
1545 }
1546 
1547 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1548   switch(typ) {
1549     case T_INT:
1550       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1551       break;
1552     case T_FLOAT:
1553       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1554       break;
1555     case T_LONG:
1556       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1557       break;
1558     case T_DOUBLE:
1559       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1560       break;
1561     default:
1562       assert(false,"Should not reach here.");
1563       break;
1564   }
1565 }
1566 
1567 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1568   switch(typ) {
1569     case T_INT:
1570       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1571       break;
1572     case T_FLOAT:
1573       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1574       break;
1575     case T_LONG:
1576       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1577       break;
1578     case T_DOUBLE:
1579       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1580       break;
1581     default:
1582       assert(false,"Should not reach here.");
1583       break;
1584   }
1585 }
1586 
1587 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1588   switch(typ) {
1589     case T_INT:
1590       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1591       break;
1592     case T_FLOAT:
1593       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1594       break;
1595     case T_LONG:
1596       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1597       break;
1598     case T_DOUBLE:
1599       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1600       break;
1601     default:
1602       assert(false,"Should not reach here.");
1603       break;
1604   }
1605 }
1606 
1607 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1608   if (vlen_in_bytes <= 16) {
1609     pxor (dst, dst);
1610     psubb(dst, src);
1611     switch (elem_bt) {
1612       case T_BYTE:   /* nothing to do */ break;
1613       case T_SHORT:  pmovsxbw(dst, dst); break;
1614       case T_INT:    pmovsxbd(dst, dst); break;
1615       case T_FLOAT:  pmovsxbd(dst, dst); break;
1616       case T_LONG:   pmovsxbq(dst, dst); break;
1617       case T_DOUBLE: pmovsxbq(dst, dst); break;
1618 
1619       default: assert(false, "%s", type2name(elem_bt));
1620     }
1621   } else {
1622     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1623     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1624 
1625     vpxor (dst, dst, dst, vlen_enc);
1626     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1627 
1628     switch (elem_bt) {
1629       case T_BYTE:   /* nothing to do */            break;
1630       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1631       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1632       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1633       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1634       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1635 
1636       default: assert(false, "%s", type2name(elem_bt));
1637     }
1638   }
1639 }
1640 
1641 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1642   if (novlbwdq) {
1643     vpmovsxbd(xtmp, src, vlen_enc);
1644     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1645             Assembler::eq, true, vlen_enc, noreg);
1646   } else {
1647     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1648     vpsubb(xtmp, xtmp, src, vlen_enc);
1649     evpmovb2m(dst, xtmp, vlen_enc);
1650   }
1651 }
1652 
1653 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1654   if (is_integral_type(bt)) {
1655     switch (vlen_in_bytes) {
1656       case 4:  movdl(dst, src);   break;
1657       case 8:  movq(dst, src);    break;
1658       case 16: movdqu(dst, src);  break;
1659       case 32: vmovdqu(dst, src); break;
1660       case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1661       default: ShouldNotReachHere();
1662     }
1663   } else {
1664     switch (vlen_in_bytes) {
1665       case 4:  movflt(dst, src); break;
1666       case 8:  movdbl(dst, src); break;
1667       case 16: movups(dst, src); break;
1668       case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1669       case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1670       default: ShouldNotReachHere();
1671     }
1672   }
1673 }
1674 
1675 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1676   assert(rscratch != noreg || always_reachable(src), "missing");
1677 
1678   if (reachable(src)) {
1679     load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1680   } else {
1681     lea(rscratch, src);
1682     load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1683   }
1684 }
1685 
1686 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1687   int vlen_enc = vector_length_encoding(vlen);
1688   if (VM_Version::supports_avx()) {
1689     if (bt == T_LONG) {
1690       if (VM_Version::supports_avx2()) {
1691         vpbroadcastq(dst, src, vlen_enc);
1692       } else {
1693         vmovddup(dst, src, vlen_enc);
1694       }
1695     } else if (bt == T_DOUBLE) {
1696       if (vlen_enc != Assembler::AVX_128bit) {
1697         vbroadcastsd(dst, src, vlen_enc, noreg);
1698       } else {
1699         vmovddup(dst, src, vlen_enc);
1700       }
1701     } else {
1702       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1703         vpbroadcastd(dst, src, vlen_enc);
1704       } else {
1705         vbroadcastss(dst, src, vlen_enc);
1706       }
1707     }
1708   } else if (VM_Version::supports_sse3()) {
1709     movddup(dst, src);
1710   } else {
1711     load_vector(bt, dst, src, vlen);
1712   }
1713 }
1714 
1715 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1716   int entry_idx = vector_iota_entry_index(bt);
1717   ExternalAddress addr(StubRoutines::x86::vector_iota_indices(entry_idx));
1718   load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1719 }
1720 
1721 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1722 
1723 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1724   int vector_len = Assembler::AVX_128bit;
1725 
1726   switch (opcode) {
1727     case Op_AndReductionV:  pand(dst, src); break;
1728     case Op_OrReductionV:   por (dst, src); break;
1729     case Op_XorReductionV:  pxor(dst, src); break;
1730     case Op_MinReductionV:
1731       switch (typ) {
1732         case T_BYTE:        pminsb(dst, src); break;
1733         case T_SHORT:       pminsw(dst, src); break;
1734         case T_INT:         pminsd(dst, src); break;
1735         case T_LONG:        assert(UseAVX > 2, "required");
1736                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1737         default:            assert(false, "wrong type");
1738       }
1739       break;
1740     case Op_MaxReductionV:
1741       switch (typ) {
1742         case T_BYTE:        pmaxsb(dst, src); break;
1743         case T_SHORT:       pmaxsw(dst, src); break;
1744         case T_INT:         pmaxsd(dst, src); break;
1745         case T_LONG:        assert(UseAVX > 2, "required");
1746                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1747         default:            assert(false, "wrong type");
1748       }
1749       break;
1750     case Op_UMinReductionV:
1751       switch (typ) {
1752         case T_BYTE:        vpminub(dst, dst, src, Assembler::AVX_128bit); break;
1753         case T_SHORT:       vpminuw(dst, dst, src, Assembler::AVX_128bit); break;
1754         case T_INT:         vpminud(dst, dst, src, Assembler::AVX_128bit); break;
1755         case T_LONG:        evpminuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
1756         default:            assert(false, "wrong type");
1757       }
1758       break;
1759     case Op_UMaxReductionV:
1760       switch (typ) {
1761         case T_BYTE:        vpmaxub(dst, dst, src, Assembler::AVX_128bit); break;
1762         case T_SHORT:       vpmaxuw(dst, dst, src, Assembler::AVX_128bit); break;
1763         case T_INT:         vpmaxud(dst, dst, src, Assembler::AVX_128bit); break;
1764         case T_LONG:        evpmaxuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
1765         default:            assert(false, "wrong type");
1766       }
1767       break;
1768     case Op_AddReductionVF: addss(dst, src); break;
1769     case Op_AddReductionVD: addsd(dst, src); break;
1770     case Op_AddReductionVI:
1771       switch (typ) {
1772         case T_BYTE:        paddb(dst, src); break;
1773         case T_SHORT:       paddw(dst, src); break;
1774         case T_INT:         paddd(dst, src); break;
1775         default:            assert(false, "wrong type");
1776       }
1777       break;
1778     case Op_AddReductionVL: paddq(dst, src); break;
1779     case Op_MulReductionVF: mulss(dst, src); break;
1780     case Op_MulReductionVD: mulsd(dst, src); break;
1781     case Op_MulReductionVI:
1782       switch (typ) {
1783         case T_SHORT:       pmullw(dst, src); break;
1784         case T_INT:         pmulld(dst, src); break;
1785         default:            assert(false, "wrong type");
1786       }
1787       break;
1788     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1789                             evpmullq(dst, dst, src, vector_len); break;
1790     default:                assert(false, "wrong opcode");
1791   }
1792 }
1793 
1794 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1795   switch (opcode) {
1796     case Op_AddReductionVF: addps(dst, src); break;
1797     case Op_AddReductionVD: addpd(dst, src); break;
1798     case Op_MulReductionVF: mulps(dst, src); break;
1799     case Op_MulReductionVD: mulpd(dst, src); break;
1800     default:                assert(false, "%s", NodeClassNames[opcode]);
1801   }
1802 }
1803 
1804 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1805   int vector_len = Assembler::AVX_256bit;
1806 
1807   switch (opcode) {
1808     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1809     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1810     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1811     case Op_MinReductionV:
1812       switch (typ) {
1813         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1814         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1815         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1816         case T_LONG:        assert(UseAVX > 2, "required");
1817                             vpminsq(dst, src1, src2, vector_len); break;
1818         default:            assert(false, "wrong type");
1819       }
1820       break;
1821     case Op_MaxReductionV:
1822       switch (typ) {
1823         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1824         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1825         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1826         case T_LONG:        assert(UseAVX > 2, "required");
1827                             vpmaxsq(dst, src1, src2, vector_len); break;
1828         default:            assert(false, "wrong type");
1829       }
1830       break;
1831     case Op_UMinReductionV:
1832       switch (typ) {
1833         case T_BYTE:        vpminub(dst, src1, src2, vector_len); break;
1834         case T_SHORT:       vpminuw(dst, src1, src2, vector_len); break;
1835         case T_INT:         vpminud(dst, src1, src2, vector_len); break;
1836         case T_LONG:        evpminuq(dst, k0, src1, src2, true, vector_len); break;
1837         default:            assert(false, "wrong type");
1838       }
1839       break;
1840     case Op_UMaxReductionV:
1841       switch (typ) {
1842         case T_BYTE:        vpmaxub(dst, src1, src2, vector_len); break;
1843         case T_SHORT:       vpmaxuw(dst, src1, src2, vector_len); break;
1844         case T_INT:         vpmaxud(dst, src1, src2, vector_len); break;
1845         case T_LONG:        evpmaxuq(dst, k0, src1, src2, true, vector_len); break;
1846         default:            assert(false, "wrong type");
1847       }
1848       break;
1849     case Op_AddReductionVI:
1850       switch (typ) {
1851         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1852         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1853         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1854         default:            assert(false, "wrong type");
1855       }
1856       break;
1857     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1858     case Op_MulReductionVI:
1859       switch (typ) {
1860         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1861         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1862         default:            assert(false, "wrong type");
1863       }
1864       break;
1865     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1866     default:                assert(false, "wrong opcode");
1867   }
1868 }
1869 
1870 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1871   int vector_len = Assembler::AVX_256bit;
1872 
1873   switch (opcode) {
1874     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1875     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1876     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1877     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1878     default:                assert(false, "%s", NodeClassNames[opcode]);
1879   }
1880 }
1881 
1882 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1883                                   XMMRegister dst, XMMRegister src,
1884                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1885   switch (opcode) {
1886     case Op_AddReductionVF:
1887     case Op_MulReductionVF:
1888       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1889       break;
1890 
1891     case Op_AddReductionVD:
1892     case Op_MulReductionVD:
1893       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1894       break;
1895 
1896     default: assert(false, "wrong opcode");
1897   }
1898 }
1899 
1900 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1901                                             XMMRegister dst, XMMRegister src,
1902                                             XMMRegister vtmp1, XMMRegister vtmp2) {
1903   switch (opcode) {
1904     case Op_AddReductionVF:
1905     case Op_MulReductionVF:
1906       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1907       break;
1908 
1909     case Op_AddReductionVD:
1910     case Op_MulReductionVD:
1911       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1912       break;
1913 
1914     default: assert(false, "%s", NodeClassNames[opcode]);
1915   }
1916 }
1917 
1918 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1919                              Register dst, Register src1, XMMRegister src2,
1920                              XMMRegister vtmp1, XMMRegister vtmp2) {
1921   switch (vlen) {
1922     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1923     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1924     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1925     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1926 
1927     default: assert(false, "wrong vector length");
1928   }
1929 }
1930 
1931 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1932                              Register dst, Register src1, XMMRegister src2,
1933                              XMMRegister vtmp1, XMMRegister vtmp2) {
1934   switch (vlen) {
1935     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1936     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1937     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1938     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1939 
1940     default: assert(false, "wrong vector length");
1941   }
1942 }
1943 
1944 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1945                              Register dst, Register src1, XMMRegister src2,
1946                              XMMRegister vtmp1, XMMRegister vtmp2) {
1947   switch (vlen) {
1948     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1949     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1950     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1951     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1952 
1953     default: assert(false, "wrong vector length");
1954   }
1955 }
1956 
1957 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1958                              Register dst, Register src1, XMMRegister src2,
1959                              XMMRegister vtmp1, XMMRegister vtmp2) {
1960   switch (vlen) {
1961     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1962     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1963     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1964     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1965 
1966     default: assert(false, "wrong vector length");
1967   }
1968 }
1969 
1970 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1971                              Register dst, Register src1, XMMRegister src2,
1972                              XMMRegister vtmp1, XMMRegister vtmp2) {
1973   switch (vlen) {
1974     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1975     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1976     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1977 
1978     default: assert(false, "wrong vector length");
1979   }
1980 }
1981 
1982 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1983   switch (vlen) {
1984     case 2:
1985       assert(vtmp2 == xnoreg, "");
1986       reduce2F(opcode, dst, src, vtmp1);
1987       break;
1988     case 4:
1989       assert(vtmp2 == xnoreg, "");
1990       reduce4F(opcode, dst, src, vtmp1);
1991       break;
1992     case 8:
1993       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1994       break;
1995     case 16:
1996       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1997       break;
1998     default: assert(false, "wrong vector length");
1999   }
2000 }
2001 
2002 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2003   switch (vlen) {
2004     case 2:
2005       assert(vtmp2 == xnoreg, "");
2006       reduce2D(opcode, dst, src, vtmp1);
2007       break;
2008     case 4:
2009       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2010       break;
2011     case 8:
2012       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2013       break;
2014     default: assert(false, "wrong vector length");
2015   }
2016 }
2017 
2018 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2019   switch (vlen) {
2020     case 2:
2021       assert(vtmp1 == xnoreg, "");
2022       assert(vtmp2 == xnoreg, "");
2023       unorderedReduce2F(opcode, dst, src);
2024       break;
2025     case 4:
2026       assert(vtmp2 == xnoreg, "");
2027       unorderedReduce4F(opcode, dst, src, vtmp1);
2028       break;
2029     case 8:
2030       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2031       break;
2032     case 16:
2033       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2034       break;
2035     default: assert(false, "wrong vector length");
2036   }
2037 }
2038 
2039 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2040   switch (vlen) {
2041     case 2:
2042       assert(vtmp1 == xnoreg, "");
2043       assert(vtmp2 == xnoreg, "");
2044       unorderedReduce2D(opcode, dst, src);
2045       break;
2046     case 4:
2047       assert(vtmp2 == xnoreg, "");
2048       unorderedReduce4D(opcode, dst, src, vtmp1);
2049       break;
2050     case 8:
2051       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2052       break;
2053     default: assert(false, "wrong vector length");
2054   }
2055 }
2056 
2057 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2058   if (opcode == Op_AddReductionVI) {
2059     if (vtmp1 != src2) {
2060       movdqu(vtmp1, src2);
2061     }
2062     phaddd(vtmp1, vtmp1);
2063   } else {
2064     pshufd(vtmp1, src2, 0x1);
2065     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2066   }
2067   movdl(vtmp2, src1);
2068   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2069   movdl(dst, vtmp1);
2070 }
2071 
2072 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2073   if (opcode == Op_AddReductionVI) {
2074     if (vtmp1 != src2) {
2075       movdqu(vtmp1, src2);
2076     }
2077     phaddd(vtmp1, src2);
2078     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2079   } else {
2080     pshufd(vtmp2, src2, 0xE);
2081     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2082     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2083   }
2084 }
2085 
2086 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2087   if (opcode == Op_AddReductionVI) {
2088     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2089     vextracti128_high(vtmp2, vtmp1);
2090     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2091     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2092   } else {
2093     vextracti128_high(vtmp1, src2);
2094     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2095     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2096   }
2097 }
2098 
// Reduces the 16 ints of src2 together with the scalar src1 into dst:
// the high half extracted by vextracti64x4_high is combined with src2 using
// a 256-bit step, and reduce8I finishes on the result held in vtmp2.
2099 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2100   vextracti64x4_high(vtmp2, src2);
2101   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); // vtmp2 = high half <op> low half
2102   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2103 }
2104 
// Reduces the eight low bytes of src2 together with the scalar src1 into dst.
// The bytes are folded 8 -> 4 -> 2 -> 1 by combining the vector with copies of
// itself shifted by 4, 2 and 1 bytes. The surviving byte is widened to an int
// (zero-extended for the unsigned min/max opcodes, sign-extended otherwise),
// combined with src1, and the low byte of the result is sign-extended into dst.
2105 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2106   pshufd(vtmp2, src2, 0x1); // low dword of vtmp2 = bytes 4..7 of src2
2107   reduce_operation_128(T_BYTE, opcode, vtmp2, src2); // 4 partial results in bytes 0..3
2108   movdqu(vtmp1, vtmp2);
2109   psrldq(vtmp1, 2); // shift right by 2 bytes
2110   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); // 2 partial results in bytes 0..1
2111   movdqu(vtmp2, vtmp1);
2112   psrldq(vtmp2, 1); // shift right by 1 byte
2113   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); // byte 0 of vtmp1 = reduction of all 8 bytes
2114   movdl(vtmp2, src1); // bring in the scalar input
2115   if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
2116     pmovzxbd(vtmp1, vtmp1);
2117   } else {
2118     pmovsxbd(vtmp1, vtmp1);
2119   }
2120   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); // combine with src1 at int width
2121   pextrb(dst, vtmp1, 0x0);
2122   movsbl(dst, dst); // result is the sign-extended low byte
2123 }
2124 
// Reduces the 16 bytes of src2 together with the scalar src1 into dst:
// the upper 8 bytes are combined bytewise with the lower 8, and reduce8B
// finishes on the partial results held in vtmp1.
2125 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2126   pshufd(vtmp1, src2, 0xE); // low 64 bits of vtmp1 = high 64 bits of src2
2127   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2128   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2129 }
2130 
// Reduces the 32 bytes of src2 together with the scalar src1 into dst:
// the half extracted by vextracti128_high is combined bytewise with src2,
// and reduce16B finishes on the partial results held in vtmp2.
2131 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2132   vextracti128_high(vtmp2, src2);
2133   reduce_operation_128(T_BYTE, opcode, vtmp2, src2); // vtmp2 = high half <op> low half
2134   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2135 }
2136 
// Reduces the 64 bytes of src2 together with the scalar src1 into dst:
// the half extracted by vextracti64x4_high is combined bytewise with src2
// using a 256-bit step, and reduce32B finishes on the result held in vtmp1.
2137 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2138   vextracti64x4_high(vtmp1, src2);
2139   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); // vtmp1 = high half <op> low half
2140   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2141 }
2142 
// Multiply-reduces eight bytes of src2 with the scalar src1 into dst by
// sign-extending the bytes to shorts in vtmp2 and delegating to reduce8S.
2143 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2144   pmovsxbw(vtmp2, src2); // 8 bytes -> 8 shorts
2145   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2146 }
2147 
2148 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2149   if (UseAVX > 1) {
2150     int vector_len = Assembler::AVX_256bit;
2151     vpmovsxbw(vtmp1, src2, vector_len);
2152     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2153   } else {
2154     pmovsxbw(vtmp2, src2);
2155     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2156     pshufd(vtmp2, src2, 0xe);
2157     pmovsxbw(vtmp2, vtmp2);
2158     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2159   }
2160 }
2161 
2162 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2163   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2164     int vector_len = Assembler::AVX_512bit;
2165     vpmovsxbw(vtmp1, src2, vector_len);
2166     reduce32S(opcode, dst, src1, vtmp1, vtmp2, vtmp1);
2167   } else {
2168     assert(UseAVX >= 2,"Should not reach here.");
2169     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2170     vextracti128_high(vtmp2, src2);
2171     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2172   }
2173 }
2174 
// Multiply-reduces the 64 bytes of src2 with the scalar src1 into dst in two
// mulreduce32B passes: first on src2 itself, then on the half extracted by
// vextracti64x4_high, with dst from the first pass as the scalar input.
2175 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2176   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2177   vextracti64x4_high(vtmp2, src2);
2178   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); // continue from the partial result in dst
2179 }
2180 
2181 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2182   if (opcode == Op_AddReductionVI) {
2183     if (vtmp1 != src2) {
2184       movdqu(vtmp1, src2);
2185     }
2186     phaddw(vtmp1, vtmp1);
2187     phaddw(vtmp1, vtmp1);
2188   } else {
2189     pshufd(vtmp2, src2, 0x1);
2190     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2191     movdqu(vtmp1, vtmp2);
2192     psrldq(vtmp1, 2);
2193     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2194   }
2195   movdl(vtmp2, src1);
2196   if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
2197     pmovzxwd(vtmp1, vtmp1);
2198   } else {
2199     pmovsxwd(vtmp1, vtmp1);
2200   }
2201   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2202   pextrw(dst, vtmp1, 0x0);
2203   movswl(dst, dst);
2204 }
2205 
2206 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2207   if (opcode == Op_AddReductionVI) {
2208     if (vtmp1 != src2) {
2209       movdqu(vtmp1, src2);
2210     }
2211     phaddw(vtmp1, src2);
2212   } else {
2213     assert_different_registers(src2, vtmp1);
2214     pshufd(vtmp1, src2, 0xE);
2215     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2216   }
2217   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2218 }
2219 
2220 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2221   if (opcode == Op_AddReductionVI) {
2222     int vector_len = Assembler::AVX_256bit;
2223     vphaddw(vtmp2, src2, src2, vector_len);
2224     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2225   } else {
2226     assert_different_registers(src2, vtmp2);
2227     vextracti128_high(vtmp2, src2);
2228     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2229   }
2230   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2231 }
2232 
2233 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2234   assert_different_registers(src2, vtmp1);
2235   int vector_len = Assembler::AVX_256bit;
2236   vextracti64x4_high(vtmp1, src2);
2237   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2238   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2239 }
2240 
// Reduces the two longs of src2 together with the scalar src1 and leaves the
// result in the general-purpose register dst.
2241 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2242   pshufd(vtmp2, src2, 0xE); // low 64 bits of vtmp2 = high 64 bits of src2
2243   reduce_operation_128(T_LONG, opcode, vtmp2, src2); // vtmp2 = element 1 <op> element 0
2244   movdq(vtmp1, src1); // bring in the scalar input
2245   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2246   movdq(dst, vtmp1);
2247 }
2248 
// Reduces the four longs of src2 together with the scalar src1 into dst:
// the half extracted by vextracti128_high is combined with src2, and
// reduce2L finishes on the partial results held in vtmp1.
2249 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2250   vextracti128_high(vtmp1, src2);
2251   reduce_operation_128(T_LONG, opcode, vtmp1, src2); // vtmp1 = high half <op> low half
2252   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2253 }
2254 
// Reduces the eight longs of src2 together with the scalar src1 into dst:
// the half extracted by vextracti64x4_high is combined with src2 using a
// 256-bit step, and reduce4L finishes on the result held in vtmp2.
2255 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2256   vextracti64x4_high(vtmp2, src2);
2257   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); // vtmp2 = high half <op> low half
2258   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2259 }
2260 
// Builds an opmask in dst from the count held in len: temp is set to all ones,
// BZHI zeroes its bits from bit index len upward, and the value is moved to
// dst. temp is overwritten.
2261 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2262   mov64(temp, -1L); // all 64 bits set
2263   bzhiq(temp, temp, len); // keep only the low 'len' bits
2264   kmovql(dst, temp);
2265 }
2266 
// Ordered 2-float reduction: accumulates element 0 and then element 1 of src
// into dst, in that order. vtmp is overwritten.
2267 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2268   reduce_operation_128(T_FLOAT, opcode, dst, src); // dst <op>= src[0]
2269   pshufd(vtmp, src, 0x1); // low lane of vtmp = src[1]
2270   reduce_operation_128(T_FLOAT, opcode, dst, vtmp); // dst <op>= src[1]
2271 }
2272 
// Ordered 4-float reduction: accumulates elements 0..3 of src into dst in
// index order. vtmp is overwritten.
2273 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2274   reduce2F(opcode, dst, src, vtmp); // elements 0 and 1
2275   pshufd(vtmp, src, 0x2); // low lane of vtmp = src[2]
2276   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2277   pshufd(vtmp, src, 0x3); // low lane of vtmp = src[3]
2278   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2279 }
2280 
// Ordered 8-float reduction: accumulates the four floats in the low 128 bits
// of src into dst, then the four in the half extracted by vextractf128_high.
// Both temporaries are overwritten.
2281 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2282   reduce4F(opcode, dst, src, vtmp2);
2283   vextractf128_high(vtmp2, src);
2284   reduce4F(opcode, dst, vtmp2, vtmp1);
2285 }
2286 
// Ordered 16-float reduction: accumulates the eight floats in the low 256 bits
// of src into dst, then the eight in the half extracted by
// vextracti64x4_high. Both temporaries are overwritten.
2287 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2288   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2289   vextracti64x4_high(vtmp1, src);
2290   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2291 }
2292 
2293 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2294   pshufd(dst, src, 0x1);
2295   reduce_operation_128(T_FLOAT, opcode, dst, src);
2296 }
2297 
2298 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2299   pshufd(vtmp, src, 0xE);
2300   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2301   unorderedReduce2F(opcode, dst, vtmp);
2302 }
2303 
2304 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2305   vextractf128_high(vtmp1, src);
2306   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2307   unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2308 }
2309 
2310 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2311   vextractf64x4_high(vtmp2, src);
2312   unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2313   unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2314 }
2315 
2316 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2317   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2318   pshufd(vtmp, src, 0xE);
2319   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2320 }
2321 
2322 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2323   reduce2D(opcode, dst, src, vtmp2);
2324   vextractf128_high(vtmp2, src);
2325   reduce2D(opcode, dst, vtmp2, vtmp1);
2326 }
2327 
2328 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2329   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2330   vextracti64x4_high(vtmp1, src);
2331   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2332 }
2333 
2334 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2335   pshufd(dst, src, 0xE);
2336   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2337 }
2338 
2339 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2340   vextractf128_high(vtmp, src);
2341   unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2342   unorderedReduce2D(opcode, dst, vtmp);
2343 }
2344 
2345 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2346   vextractf64x4_high(vtmp2, src);
2347   unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2348   unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2349 }
2350 
2351 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2352   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2353 }
2354 
2355 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2356   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2357 }
2358 
2359 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
2360   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2361 }
2362 
2363 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2364                                  int vec_enc) {
2365   switch(elem_bt) {
2366     case T_INT:
2367     case T_FLOAT:
2368       vmaskmovps(dst, src, mask, vec_enc);
2369       break;
2370     case T_LONG:
2371     case T_DOUBLE:
2372       vmaskmovpd(dst, src, mask, vec_enc);
2373       break;
2374     default:
2375       fatal("Unsupported type %s", type2name(elem_bt));
2376       break;
2377   }
2378 }
2379 
2380 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2381                                  int vec_enc) {
2382   switch(elem_bt) {
2383     case T_INT:
2384     case T_FLOAT:
2385       vmaskmovps(dst, src, mask, vec_enc);
2386       break;
2387     case T_LONG:
2388     case T_DOUBLE:
2389       vmaskmovpd(dst, src, mask, vec_enc);
2390       break;
2391     default:
2392       fatal("Unsupported type %s", type2name(elem_bt));
2393       break;
2394   }
2395 }
2396 
2397 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2398                                           XMMRegister dst, XMMRegister src,
2399                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2400                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2401   const int permconst[] = {1, 14};
2402   XMMRegister wsrc = src;
2403   XMMRegister wdst = xmm_0;
2404   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2405 
2406   int vlen_enc = Assembler::AVX_128bit;
2407   if (vlen == 16) {
2408     vlen_enc = Assembler::AVX_256bit;
2409   }
2410 
2411   for (int i = log2(vlen) - 1; i >=0; i--) {
2412     if (i == 0 && !is_dst_valid) {
2413       wdst = dst;
2414     }
2415     if (i == 3) {
2416       vextracti64x4_high(wtmp, wsrc);
2417     } else if (i == 2) {
2418       vextracti128_high(wtmp, wsrc);
2419     } else { // i = [0,1]
2420       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2421     }
2422 
2423     if (VM_Version::supports_avx10_2()) {
2424       vminmax_fp_avx10_2(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
2425     } else {
2426       vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2427     }
2428     wsrc = wdst;
2429     vlen_enc = Assembler::AVX_128bit;
2430   }
2431   if (is_dst_valid) {
2432     if (VM_Version::supports_avx10_2()) {
2433       vminmax_fp_avx10_2(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
2434     } else {
2435       vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2436     }
2437   }
2438 }
2439 
2440 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2441                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2442                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2443   XMMRegister wsrc = src;
2444   XMMRegister wdst = xmm_0;
2445   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2446   int vlen_enc = Assembler::AVX_128bit;
2447   if (vlen == 8) {
2448     vlen_enc = Assembler::AVX_256bit;
2449   }
2450   for (int i = log2(vlen) - 1; i >=0; i--) {
2451     if (i == 0 && !is_dst_valid) {
2452       wdst = dst;
2453     }
2454     if (i == 1) {
2455       vextracti128_high(wtmp, wsrc);
2456     } else if (i == 2) {
2457       vextracti64x4_high(wtmp, wsrc);
2458     } else {
2459       assert(i == 0, "%d", i);
2460       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2461     }
2462 
2463     if (VM_Version::supports_avx10_2()) {
2464       vminmax_fp_avx10_2(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
2465     } else {
2466       vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2467     }
2468 
2469     wsrc = wdst;
2470     vlen_enc = Assembler::AVX_128bit;
2471   }
2472 
2473   if (is_dst_valid) {
2474     if (VM_Version::supports_avx10_2()) {
2475       vminmax_fp_avx10_2(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
2476     } else {
2477       vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2478     }
2479   }
2480 }
2481 
2482 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2483   switch (bt) {
2484     case T_BYTE:  pextrb(dst, src, idx); break;
2485     case T_SHORT: pextrw(dst, src, idx); break;
2486     case T_INT:   pextrd(dst, src, idx); break;
2487     case T_LONG:  pextrq(dst, src, idx); break;
2488 
2489     default:
2490       assert(false,"Should not reach here.");
2491       break;
2492   }
2493 }
2494 
2495 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2496   int esize =  type2aelembytes(typ);
2497   int elem_per_lane = 16/esize;
2498   int lane = elemindex / elem_per_lane;
2499   int eindex = elemindex % elem_per_lane;
2500 
2501   if (lane >= 2) {
2502     assert(UseAVX > 2, "required");
2503     vextractf32x4(dst, src, lane & 3);
2504     return dst;
2505   } else if (lane > 0) {
2506     assert(UseAVX > 0, "required");
2507     vextractf128(dst, src, lane);
2508     return dst;
2509   } else {
2510     return src;
2511   }
2512 }
2513 
2514 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2515   if (typ == T_BYTE) {
2516     movsbl(dst, dst);
2517   } else if (typ == T_SHORT) {
2518     movswl(dst, dst);
2519   }
2520 }
2521 
2522 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2523   int esize =  type2aelembytes(typ);
2524   int elem_per_lane = 16/esize;
2525   int eindex = elemindex % elem_per_lane;
2526   assert(is_integral_type(typ),"required");
2527 
2528   if (eindex == 0) {
2529     if (typ == T_LONG) {
2530       movq(dst, src);
2531     } else {
2532       movdl(dst, src);
2533       movsxl(typ, dst);
2534     }
2535   } else {
2536     extract(typ, dst, src, eindex);
2537     movsxl(typ, dst);
2538   }
2539 }
2540 
2541 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2542   int esize =  type2aelembytes(typ);
2543   int elem_per_lane = 16/esize;
2544   int eindex = elemindex % elem_per_lane;
2545   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2546 
2547   if (eindex == 0) {
2548     movq(dst, src);
2549   } else {
2550     if (typ == T_FLOAT) {
2551       if (UseAVX == 0) {
2552         movdqu(dst, src);
2553         shufps(dst, dst, eindex);
2554       } else {
2555         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2556       }
2557     } else {
2558       if (UseAVX == 0) {
2559         movdqu(dst, src);
2560         psrldq(dst, eindex*esize);
2561       } else {
2562         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2563       }
2564       movq(dst, dst);
2565     }
2566   }
2567   // Zero upper bits
2568   if (typ == T_FLOAT) {
2569     if (UseAVX == 0) {
2570       assert(vtmp != xnoreg, "required.");
2571       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2572       pand(dst, vtmp);
2573     } else {
2574       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2575     }
2576   }
2577 }
2578 
2579 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2580   switch(typ) {
2581     case T_BYTE:
2582     case T_BOOLEAN:
2583       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2584       break;
2585     case T_SHORT:
2586     case T_CHAR:
2587       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2588       break;
2589     case T_INT:
2590     case T_FLOAT:
2591       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2592       break;
2593     case T_LONG:
2594     case T_DOUBLE:
2595       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2596       break;
2597     default:
2598       assert(false,"Should not reach here.");
2599       break;
2600   }
2601 }
2602 
2603 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2604   assert(rscratch != noreg || always_reachable(src2), "missing");
2605 
2606   switch(typ) {
2607     case T_BOOLEAN:
2608     case T_BYTE:
2609       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2610       break;
2611     case T_CHAR:
2612     case T_SHORT:
2613       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2614       break;
2615     case T_INT:
2616     case T_FLOAT:
2617       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2618       break;
2619     case T_LONG:
2620     case T_DOUBLE:
2621       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2622       break;
2623     default:
2624       assert(false,"Should not reach here.");
2625       break;
2626   }
2627 }
2628 
2629 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2630   switch(typ) {
2631     case T_BYTE:
2632       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2633       break;
2634     case T_SHORT:
2635       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2636       break;
2637     case T_INT:
2638     case T_FLOAT:
2639       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2640       break;
2641     case T_LONG:
2642     case T_DOUBLE:
2643       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2644       break;
2645     default:
2646       assert(false,"Should not reach here.");
2647       break;
2648   }
2649 }
2650 
2651 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2652   assert(vlen_in_bytes <= 32, "");
2653   int esize = type2aelembytes(bt);
2654   if (vlen_in_bytes == 32) {
2655     assert(vtmp == xnoreg, "required.");
2656     if (esize >= 4) {
2657       vtestps(src1, src2, AVX_256bit);
2658     } else {
2659       vptest(src1, src2, AVX_256bit);
2660     }
2661     return;
2662   }
2663   if (vlen_in_bytes < 16) {
2664     // Duplicate the lower part to fill the whole register,
2665     // Don't need to do so for src2
2666     assert(vtmp != xnoreg, "required");
2667     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2668     pshufd(vtmp, src1, shuffle_imm);
2669   } else {
2670     assert(vtmp == xnoreg, "required");
2671     vtmp = src1;
2672   }
2673   if (esize >= 4 && VM_Version::supports_avx()) {
2674     vtestps(vtmp, src2, AVX_128bit);
2675   } else {
2676     ptest(vtmp, src2);
2677   }
2678 }
2679 
2680 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2681 #ifdef ASSERT
2682   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2683   bool is_bw_supported = VM_Version::supports_avx512bw();
2684   if (is_bw && !is_bw_supported) {
2685     assert(vlen_enc != Assembler::AVX_512bit, "required");
2686     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2687            "XMM register should be 0-15");
2688   }
2689 #endif // ASSERT
2690   switch (elem_bt) {
2691     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2692     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2693     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2694     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2695     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2696     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2697     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2698   }
2699 }
2700 
2701 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2702   assert(UseAVX >= 2, "required");
2703   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2704   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2705   if ((UseAVX > 2) &&
2706       (!is_bw || VM_Version::supports_avx512bw()) &&
2707       (!is_vl || VM_Version::supports_avx512vl())) {
2708     switch (elem_bt) {
2709       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2710       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2711       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2712       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2713       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2714     }
2715   } else {
2716     assert(vlen_enc != Assembler::AVX_512bit, "required");
2717     assert((dst->encoding() < 16),"XMM register should be 0-15");
2718     switch (elem_bt) {
2719       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2720       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2721       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2722       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2723       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2724       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2725       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2726     }
2727   }
2728 }
2729 
2730 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2731   switch (to_elem_bt) {
2732     case T_SHORT:
2733       vpmovsxbw(dst, src, vlen_enc);
2734       break;
2735     case T_INT:
2736       vpmovsxbd(dst, src, vlen_enc);
2737       break;
2738     case T_FLOAT:
2739       vpmovsxbd(dst, src, vlen_enc);
2740       vcvtdq2ps(dst, dst, vlen_enc);
2741       break;
2742     case T_LONG:
2743       vpmovsxbq(dst, src, vlen_enc);
2744       break;
2745     case T_DOUBLE: {
2746       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2747       vpmovsxbd(dst, src, mid_vlen_enc);
2748       vcvtdq2pd(dst, dst, vlen_enc);
2749       break;
2750     }
2751     default:
2752       fatal("Unsupported type %s", type2name(to_elem_bt));
2753       break;
2754   }
2755 }
2756 
2757 //-------------------------------------------------------------------------------------------
2758 
2759 // IndexOf for constant substrings with size >= 8 chars
2760 // which don't need to be loaded through stack.
2761 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2762                                          Register cnt1, Register cnt2,
2763                                          int int_cnt2,  Register result,
2764                                          XMMRegister vec, Register tmp,
2765                                          int ae) {
2766   ShortBranchVerifier sbv(this);
2767   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2768   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2769 
2770   // This method uses the pcmpestri instruction with bound registers
2771   //   inputs:
2772   //     xmm - substring
2773   //     rax - substring length (elements count)
2774   //     mem - scanned string
2775   //     rdx - string length (elements count)
2776   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2777   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2778   //   outputs:
2779   //     rcx - matched index in string
2780   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2781   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2782   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2783   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2784   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2785 
2786   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2787         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2788         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2789 
2790   // Note, inline_string_indexOf() generates checks:
2791   // if (substr.count > string.count) return -1;
2792   // if (substr.count == 0) return 0;
2793   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2794 
2795   // Load substring.
2796   if (ae == StrIntrinsicNode::UL) {
2797     pmovzxbw(vec, Address(str2, 0));
2798   } else {
2799     movdqu(vec, Address(str2, 0));
2800   }
2801   movl(cnt2, int_cnt2);
2802   movptr(result, str1); // string addr
2803 
2804   if (int_cnt2 > stride) {
2805     jmpb(SCAN_TO_SUBSTR);
2806 
2807     // Reload substr for rescan, this code
2808     // is executed only for large substrings (> 8 chars)
2809     bind(RELOAD_SUBSTR);
2810     if (ae == StrIntrinsicNode::UL) {
2811       pmovzxbw(vec, Address(str2, 0));
2812     } else {
2813       movdqu(vec, Address(str2, 0));
2814     }
2815     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2816 
2817     bind(RELOAD_STR);
2818     // We came here after the beginning of the substring was
2819     // matched but the rest of it was not so we need to search
2820     // again. Start from the next element after the previous match.
2821 
2822     // cnt2 is number of substring reminding elements and
2823     // cnt1 is number of string reminding elements when cmp failed.
2824     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2825     subl(cnt1, cnt2);
2826     addl(cnt1, int_cnt2);
2827     movl(cnt2, int_cnt2); // Now restore cnt2
2828 
2829     decrementl(cnt1);     // Shift to next element
2830     cmpl(cnt1, cnt2);
2831     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
2832 
2833     addptr(result, (1<<scale1));
2834 
2835   } // (int_cnt2 > 8)
2836 
2837   // Scan string for start of substr in 16-byte vectors
2838   bind(SCAN_TO_SUBSTR);
2839   pcmpestri(vec, Address(result, 0), mode);
2840   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2841   subl(cnt1, stride);
2842   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2843   cmpl(cnt1, cnt2);
2844   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
2845   addptr(result, 16);
2846   jmpb(SCAN_TO_SUBSTR);
2847 
2848   // Found a potential substr
2849   bind(FOUND_CANDIDATE);
2850   // Matched whole vector if first element matched (tmp(rcx) == 0).
2851   if (int_cnt2 == stride) {
2852     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2853   } else { // int_cnt2 > 8
2854     jccb(Assembler::overflow, FOUND_SUBSTR);
2855   }
2856   // After pcmpestri tmp(rcx) contains matched element index
2857   // Compute start addr of substr
2858   lea(result, Address(result, tmp, scale1));
2859 
2860   // Make sure string is still long enough
2861   subl(cnt1, tmp);
2862   cmpl(cnt1, cnt2);
2863   if (int_cnt2 == stride) {
2864     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2865   } else { // int_cnt2 > 8
2866     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2867   }
2868   // Left less then substring.
2869 
2870   bind(RET_NOT_FOUND);
2871   movl(result, -1);
2872   jmp(EXIT);
2873 
2874   if (int_cnt2 > stride) {
2875     // This code is optimized for the case when whole substring
2876     // is matched if its head is matched.
2877     bind(MATCH_SUBSTR_HEAD);
2878     pcmpestri(vec, Address(result, 0), mode);
2879     // Reload only string if does not match
2880     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2881 
2882     Label CONT_SCAN_SUBSTR;
2883     // Compare the rest of substring (> 8 chars).
2884     bind(FOUND_SUBSTR);
2885     // First 8 chars are already matched.
2886     negptr(cnt2);
2887     addptr(cnt2, stride);
2888 
2889     bind(SCAN_SUBSTR);
2890     subl(cnt1, stride);
2891     cmpl(cnt2, -stride); // Do not read beyond substring
2892     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2893     // Back-up strings to avoid reading beyond substring:
2894     // cnt1 = cnt1 - cnt2 + 8
2895     addl(cnt1, cnt2); // cnt2 is negative
2896     addl(cnt1, stride);
2897     movl(cnt2, stride); negptr(cnt2);
2898     bind(CONT_SCAN_SUBSTR);
2899     if (int_cnt2 < (int)G) {
2900       int tail_off1 = int_cnt2<<scale1;
2901       int tail_off2 = int_cnt2<<scale2;
2902       if (ae == StrIntrinsicNode::UL) {
2903         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2904       } else {
2905         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2906       }
2907       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2908     } else {
2909       // calculate index in register to avoid integer overflow (int_cnt2*2)
2910       movl(tmp, int_cnt2);
2911       addptr(tmp, cnt2);
2912       if (ae == StrIntrinsicNode::UL) {
2913         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2914       } else {
2915         movdqu(vec, Address(str2, tmp, scale2, 0));
2916       }
2917       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2918     }
2919     // Need to reload strings pointers if not matched whole vector
2920     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2921     addptr(cnt2, stride);
2922     jcc(Assembler::negative, SCAN_SUBSTR);
2923     // Fall through if found full substring
2924 
2925   } // (int_cnt2 > 8)
2926 
2927   bind(RET_FOUND);
2928   // Found result if we matched full small substring.
2929   // Compute substr offset
2930   subptr(result, str1);
2931   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2932     shrl(result, 1); // index
2933   }
2934   bind(EXIT);
2935 
2936 } // string_indexofC8
2937 
2938 // Small strings are loaded through stack if they cross page boundary.
2939 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2940                                        Register cnt1, Register cnt2,
2941                                        int int_cnt2,  Register result,
2942                                        XMMRegister vec, Register tmp,
2943                                        int ae) {
2944   ShortBranchVerifier sbv(this);
2945   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2946   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2947 
2948   //
2949   // int_cnt2 is length of small (< 8 chars) constant substring
2950   // or (-1) for non constant substring in which case its length
2951   // is in cnt2 register.
2952   //
2953   // Note, inline_string_indexOf() generates checks:
2954   // if (substr.count > string.count) return -1;
2955   // if (substr.count == 0) return 0;
2956   //
2957   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2958   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2959   // This method uses the pcmpestri instruction with bound registers
2960   //   inputs:
2961   //     xmm - substring
2962   //     rax - substring length (elements count)
2963   //     mem - scanned string
2964   //     rdx - string length (elements count)
2965   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2966   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2967   //   outputs:
2968   //     rcx - matched index in string
2969   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2970   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2971   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2972   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2973 
2974   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2975         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2976         FOUND_CANDIDATE;
2977 
2978   { //========================================================
2979     // We don't know where these strings are located
2980     // and we can't read beyond them. Load them through stack.
2981     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2982 
2983     movptr(tmp, rsp); // save old SP
2984 
2985     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2986       if (int_cnt2 == (1>>scale2)) { // One byte
2987         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2988         load_unsigned_byte(result, Address(str2, 0));
2989         movdl(vec, result); // move 32 bits
2990       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2991         // Not enough header space in 32-bit VM: 12+3 = 15.
2992         movl(result, Address(str2, -1));
2993         shrl(result, 8);
2994         movdl(vec, result); // move 32 bits
2995       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2996         load_unsigned_short(result, Address(str2, 0));
2997         movdl(vec, result); // move 32 bits
2998       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2999         movdl(vec, Address(str2, 0)); // move 32 bits
3000       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
3001         movq(vec, Address(str2, 0));  // move 64 bits
3002       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
3003         // Array header size is 12 bytes in 32-bit VM
3004         // + 6 bytes for 3 chars == 18 bytes,
3005         // enough space to load vec and shift.
3006         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3007         if (ae == StrIntrinsicNode::UL) {
3008           int tail_off = int_cnt2-8;
3009           pmovzxbw(vec, Address(str2, tail_off));
3010           psrldq(vec, -2*tail_off);
3011         }
3012         else {
3013           int tail_off = int_cnt2*(1<<scale2);
3014           movdqu(vec, Address(str2, tail_off-16));
3015           psrldq(vec, 16-tail_off);
3016         }
3017       }
3018     } else { // not constant substring
3019       cmpl(cnt2, stride);
3020       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3021 
3022       // We can read beyond string if srt+16 does not cross page boundary
3023       // since heaps are aligned and mapped by pages.
3024       assert(os::vm_page_size() < (int)G, "default page should be small");
3025       movl(result, str2); // We need only low 32 bits
3026       andl(result, ((int)os::vm_page_size()-1));
3027       cmpl(result, ((int)os::vm_page_size()-16));
3028       jccb(Assembler::belowEqual, CHECK_STR);
3029 
3030       // Move small strings to stack to allow load 16 bytes into vec.
3031       subptr(rsp, 16);
3032       int stk_offset = wordSize-(1<<scale2);
3033       push(cnt2);
3034 
3035       bind(COPY_SUBSTR);
3036       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3037         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3038         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3039       } else if (ae == StrIntrinsicNode::UU) {
3040         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3041         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3042       }
3043       decrement(cnt2);
3044       jccb(Assembler::notZero, COPY_SUBSTR);
3045 
3046       pop(cnt2);
3047       movptr(str2, rsp);  // New substring address
3048     } // non constant
3049 
3050     bind(CHECK_STR);
3051     cmpl(cnt1, stride);
3052     jccb(Assembler::aboveEqual, BIG_STRINGS);
3053 
3054     // Check cross page boundary.
3055     movl(result, str1); // We need only low 32 bits
3056     andl(result, ((int)os::vm_page_size()-1));
3057     cmpl(result, ((int)os::vm_page_size()-16));
3058     jccb(Assembler::belowEqual, BIG_STRINGS);
3059 
3060     subptr(rsp, 16);
3061     int stk_offset = -(1<<scale1);
3062     if (int_cnt2 < 0) { // not constant
3063       push(cnt2);
3064       stk_offset += wordSize;
3065     }
3066     movl(cnt2, cnt1);
3067 
3068     bind(COPY_STR);
3069     if (ae == StrIntrinsicNode::LL) {
3070       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3071       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3072     } else {
3073       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3074       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3075     }
3076     decrement(cnt2);
3077     jccb(Assembler::notZero, COPY_STR);
3078 
3079     if (int_cnt2 < 0) { // not constant
3080       pop(cnt2);
3081     }
3082     movptr(str1, rsp);  // New string address
3083 
3084     bind(BIG_STRINGS);
3085     // Load substring.
3086     if (int_cnt2 < 0) { // -1
3087       if (ae == StrIntrinsicNode::UL) {
3088         pmovzxbw(vec, Address(str2, 0));
3089       } else {
3090         movdqu(vec, Address(str2, 0));
3091       }
3092       push(cnt2);       // substr count
3093       push(str2);       // substr addr
3094       push(str1);       // string addr
3095     } else {
3096       // Small (< 8 chars) constant substrings are loaded already.
3097       movl(cnt2, int_cnt2);
3098     }
3099     push(tmp);  // original SP
3100 
3101   } // Finished loading
3102 
3103   //========================================================
3104   // Start search
3105   //
3106 
3107   movptr(result, str1); // string addr
3108 
3109   if (int_cnt2  < 0) {  // Only for non constant substring
3110     jmpb(SCAN_TO_SUBSTR);
3111 
3112     // SP saved at sp+0
3113     // String saved at sp+1*wordSize
3114     // Substr saved at sp+2*wordSize
3115     // Substr count saved at sp+3*wordSize
3116 
3117     // Reload substr for rescan, this code
3118     // is executed only for large substrings (> 8 chars)
3119     bind(RELOAD_SUBSTR);
3120     movptr(str2, Address(rsp, 2*wordSize));
3121     movl(cnt2, Address(rsp, 3*wordSize));
3122     if (ae == StrIntrinsicNode::UL) {
3123       pmovzxbw(vec, Address(str2, 0));
3124     } else {
3125       movdqu(vec, Address(str2, 0));
3126     }
3127     // We came here after the beginning of the substring was
3128     // matched but the rest of it was not so we need to search
3129     // again. Start from the next element after the previous match.
3130     subptr(str1, result); // Restore counter
3131     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3132       shrl(str1, 1);
3133     }
3134     addl(cnt1, str1);
3135     decrementl(cnt1);   // Shift to next element
3136     cmpl(cnt1, cnt2);
3137     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
3138 
3139     addptr(result, (1<<scale1));
3140   } // non constant
3141 
3142   // Scan string for start of substr in 16-byte vectors
3143   bind(SCAN_TO_SUBSTR);
3144   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3145   pcmpestri(vec, Address(result, 0), mode);
3146   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3147   subl(cnt1, stride);
3148   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3149   cmpl(cnt1, cnt2);
3150   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
3151   addptr(result, 16);
3152 
3153   bind(ADJUST_STR);
3154   cmpl(cnt1, stride); // Do not read beyond string
3155   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3156   // Back-up string to avoid reading beyond string.
3157   lea(result, Address(result, cnt1, scale1, -16));
3158   movl(cnt1, stride);
3159   jmpb(SCAN_TO_SUBSTR);
3160 
3161   // Found a potential substr
3162   bind(FOUND_CANDIDATE);
3163   // After pcmpestri tmp(rcx) contains matched element index
3164 
3165   // Make sure string is still long enough
3166   subl(cnt1, tmp);
3167   cmpl(cnt1, cnt2);
3168   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3169   // Left less then substring.
3170 
3171   bind(RET_NOT_FOUND);
3172   movl(result, -1);
3173   jmp(CLEANUP);
3174 
3175   bind(FOUND_SUBSTR);
3176   // Compute start addr of substr
3177   lea(result, Address(result, tmp, scale1));
3178   if (int_cnt2 > 0) { // Constant substring
3179     // Repeat search for small substring (< 8 chars)
3180     // from new point without reloading substring.
3181     // Have to check that we don't read beyond string.
3182     cmpl(tmp, stride-int_cnt2);
3183     jccb(Assembler::greater, ADJUST_STR);
3184     // Fall through if matched whole substring.
3185   } else { // non constant
3186     assert(int_cnt2 == -1, "should be != 0");
3187 
3188     addl(tmp, cnt2);
3189     // Found result if we matched whole substring.
3190     cmpl(tmp, stride);
3191     jcc(Assembler::lessEqual, RET_FOUND);
3192 
3193     // Repeat search for small substring (<= 8 chars)
3194     // from new point 'str1' without reloading substring.
3195     cmpl(cnt2, stride);
3196     // Have to check that we don't read beyond string.
3197     jccb(Assembler::lessEqual, ADJUST_STR);
3198 
3199     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3200     // Compare the rest of substring (> 8 chars).
3201     movptr(str1, result);
3202 
3203     cmpl(tmp, cnt2);
3204     // First 8 chars are already matched.
3205     jccb(Assembler::equal, CHECK_NEXT);
3206 
3207     bind(SCAN_SUBSTR);
3208     pcmpestri(vec, Address(str1, 0), mode);
3209     // Need to reload strings pointers if not matched whole vector
3210     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3211 
3212     bind(CHECK_NEXT);
3213     subl(cnt2, stride);
3214     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3215     addptr(str1, 16);
3216     if (ae == StrIntrinsicNode::UL) {
3217       addptr(str2, 8);
3218     } else {
3219       addptr(str2, 16);
3220     }
3221     subl(cnt1, stride);
3222     cmpl(cnt2, stride); // Do not read beyond substring
3223     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3224     // Back-up strings to avoid reading beyond substring.
3225 
3226     if (ae == StrIntrinsicNode::UL) {
3227       lea(str2, Address(str2, cnt2, scale2, -8));
3228       lea(str1, Address(str1, cnt2, scale1, -16));
3229     } else {
3230       lea(str2, Address(str2, cnt2, scale2, -16));
3231       lea(str1, Address(str1, cnt2, scale1, -16));
3232     }
3233     subl(cnt1, cnt2);
3234     movl(cnt2, stride);
3235     addl(cnt1, stride);
3236     bind(CONT_SCAN_SUBSTR);
3237     if (ae == StrIntrinsicNode::UL) {
3238       pmovzxbw(vec, Address(str2, 0));
3239     } else {
3240       movdqu(vec, Address(str2, 0));
3241     }
3242     jmp(SCAN_SUBSTR);
3243 
3244     bind(RET_FOUND_LONG);
3245     movptr(str1, Address(rsp, wordSize));
3246   } // non constant
3247 
3248   bind(RET_FOUND);
3249   // Compute substr offset
3250   subptr(result, str1);
3251   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3252     shrl(result, 1); // index
3253   }
3254   bind(CLEANUP);
3255   pop(rsp); // restore SP
3256 
3257 } // string_indexof
3258 
3259 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3260                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
       // Emits a search for the value in ch among the cnt1 two-byte elements that
       // start at str1. On exit, result holds the element index of the first
       // match, or -1 when no element matches.
       //
       // The emitted code overwrites cnt1, tmp and vec1..vec3, and also ch when the
       // match is found by one of the vector loops. Lengths are compared as signed
       // 32-bit values.
       //
       // Strategy: with UseAVX >= 2, a 256-bit loop handles 16 elements per
       // iteration; a 128-bit loop then handles 8 elements per iteration; whatever
       // is left is checked one element at a time.
3261   ShortBranchVerifier sbv(this);
3262   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3263 
       // Number of 2-byte elements in one 16-byte vector.
3264   int stride = 8;
3265 
3266   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3267         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3268         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3269         FOUND_SEQ_CHAR, DONE_LABEL;
3270 
       // result is the scan cursor; it is turned into an index only at the end.
3271   movptr(result, str1);
3272   if (UseAVX >= 2) {
         // Fewer than 8 elements: scalar loop only. Fewer than 16: skip the
         // 256-bit loop but still set up vec1/vec2 for the 128-bit loop.
3273     cmpl(cnt1, stride);
3274     jcc(Assembler::less, SCAN_TO_CHAR);
3275     cmpl(cnt1, 2*stride);
3276     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
         // vec1 = ch replicated into every 16-bit lane, vec2 = all zeroes.
3277     movdl(vec1, ch);
3278     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3279     vpxor(vec2, vec2);
3280     movl(tmp, cnt1);
3281     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3282     andl(cnt1,0x0000000F);  //tail count (in chars)
3283 
3284     bind(SCAN_TO_16_CHAR_LOOP);
3285     vmovdqu(vec3, Address(result, 0));
         // Same vector length as the broadcast above, spelled symbolically
         // instead of as a bare literal.
3286     vpcmpeqw(vec3, vec3, vec1, Assembler::AVX_256bit);
         // vec2 is zero, so CF ends up set only when vec3 is entirely zero,
         // i.e. when no lane compared equal.
3287     vptest(vec2, vec3);
3288     jcc(Assembler::carryClear, FOUND_CHAR);
3289     addptr(result, 32);
3290     subl(tmp, 2*stride);
3291     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
         // vec1 and vec2 are already set up for the 128-bit loop.
3292     jmp(SCAN_TO_8_CHAR);
3293     bind(SCAN_TO_8_CHAR_INIT);
         // 128-bit broadcast of ch: replicate the low word across the low four
         // words, then the low dword across all dwords.
3294     movdl(vec1, ch);
3295     pshuflw(vec1, vec1, 0x00);
3296     pshufd(vec1, vec1, 0);
3297     pxor(vec2, vec2);
3298   }
3299   bind(SCAN_TO_8_CHAR);
3300   cmpl(cnt1, stride);
3301   jcc(Assembler::less, SCAN_TO_CHAR);
3302   if (UseAVX < 2) {
         // Without AVX2 nothing above has initialized vec1/vec2 yet.
3303     movdl(vec1, ch);
3304     pshuflw(vec1, vec1, 0x00);
3305     pshufd(vec1, vec1, 0);
3306     pxor(vec2, vec2);
3307   }
3308   movl(tmp, cnt1);
3309   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3310   andl(cnt1,0x00000007);  //tail count (in chars)
3311 
3312   bind(SCAN_TO_8_CHAR_LOOP);
3313   movdqu(vec3, Address(result, 0));
3314   pcmpeqw(vec3, vec1);
3315   ptest(vec2, vec3);
3316   jcc(Assembler::carryClear, FOUND_CHAR);
3317   addptr(result, 16);
3318   subl(tmp, stride);
3319   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
       // Scalar tail: cnt1 elements remain, starting at result.
3320   bind(SCAN_TO_CHAR);
3321   testl(cnt1, cnt1);
3322   jcc(Assembler::zero, RET_NOT_FOUND);
3323   bind(SCAN_TO_CHAR_LOOP);
3324   load_unsigned_short(tmp, Address(result, 0));
3325   cmpl(ch, tmp);
3326   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3327   addptr(result, 2);
3328   subl(cnt1, 1);
3329   jccb(Assembler::zero, RET_NOT_FOUND);
3330   jmp(SCAN_TO_CHAR_LOOP);
3331 
3332   bind(RET_NOT_FOUND);
3333   movl(result, -1);
3334   jmpb(DONE_LABEL);
3335 
       // A vector compare hit: vec3 has all-ones in each matching lane.
3336   bind(FOUND_CHAR);
3337   if (UseAVX >= 2) {
3338     vpmovmskb(tmp, vec3);
3339   } else {
3340     pmovmskb(tmp, vec3);
3341   }
       // tmp has one bit per byte of vec3; its lowest set bit is the byte offset
       // of the first match inside the vector. ch is reused to hold that offset.
3342   bsfl(ch, tmp);
3343   addptr(result, ch);
3344 
       // result points at the match: convert the byte distance from str1 into an
       // element index (2 bytes per element).
3345   bind(FOUND_SEQ_CHAR);
3346   subptr(result, str1);
3347   shrl(result, 1);
3348 
3349   bind(DONE_LABEL);
3350 } // string_indexof_char
3351 
3352 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3353                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
       // Emits a search for the value in ch among the cnt1 one-byte elements that
       // start at str1. On exit, result holds the index of the first match, or -1
       // when no element matches.
       //
       // The emitted code overwrites cnt1, tmp and vec1..vec3, and also ch when the
       // match is found by one of the vector loops. Lengths are compared as signed
       // 32-bit values.
       //
       // Strategy: with UseAVX >= 2, a 256-bit loop handles 32 elements per
       // iteration; a 128-bit loop then handles 16 elements per iteration; whatever
       // is left is checked one byte at a time.
3354   ShortBranchVerifier sbv(this);
3355   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3356 
       // Number of 1-byte elements in one 16-byte vector.
3357   int stride = 16;
3358 
3359   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3360         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3361         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3362         FOUND_SEQ_CHAR, DONE_LABEL;
3363 
       // result is the scan cursor; it is turned into an index only at the end.
3364   movptr(result, str1);
3365   if (UseAVX >= 2) {
         // Fewer than 16 elements: scalar loop only. Fewer than 32: skip the
         // 256-bit loop but still set up vec1/vec2 for the 128-bit loop.
3366     cmpl(cnt1, stride);
3367     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3368     cmpl(cnt1, stride*2);
3369     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
         // vec1 = ch replicated into every byte lane, vec2 = all zeroes.
3370     movdl(vec1, ch);
3371     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3372     vpxor(vec2, vec2);
3373     movl(tmp, cnt1);
3374     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3375     andl(cnt1,0x0000001F);  //tail count (in chars)
3376 
3377     bind(SCAN_TO_32_CHAR_LOOP);
3378     vmovdqu(vec3, Address(result, 0));
3379     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
         // vec2 is zero, so CF ends up set only when vec3 is entirely zero,
         // i.e. when no lane compared equal.
3380     vptest(vec2, vec3);
3381     jcc(Assembler::carryClear, FOUND_CHAR);
3382     addptr(result, 32);
3383     subl(tmp, stride*2);
3384     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
         // vec1 and vec2 are already set up for the 128-bit loop.
3385     jmp(SCAN_TO_16_CHAR);
3386 
3387     bind(SCAN_TO_16_CHAR_INIT);
         // 128-bit broadcast of ch: shuffling with an all-zero control vector
         // copies byte 0 of vec1 into every byte.
3388     movdl(vec1, ch);
3389     pxor(vec2, vec2);
3390     pshufb(vec1, vec2);
3391   }
3392 
3393   bind(SCAN_TO_16_CHAR);
3394   cmpl(cnt1, stride);
3395   jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
3396   if (UseAVX < 2) {
         // Without AVX2 nothing above has initialized vec1/vec2 yet.
3397     movdl(vec1, ch);
3398     pxor(vec2, vec2);
3399     pshufb(vec1, vec2);
3400   }
3401   movl(tmp, cnt1);
3402   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3403   andl(cnt1,0x0000000F);  //tail count (in bytes)
3404 
3405   bind(SCAN_TO_16_CHAR_LOOP);
3406   movdqu(vec3, Address(result, 0));
3407   pcmpeqb(vec3, vec1);
3408   ptest(vec2, vec3);
3409   jcc(Assembler::carryClear, FOUND_CHAR);
3410   addptr(result, 16);
3411   subl(tmp, stride);
3412   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3413 
       // Scalar tail: cnt1 elements remain, starting at result.
3414   bind(SCAN_TO_CHAR_INIT);
3415   testl(cnt1, cnt1);
3416   jcc(Assembler::zero, RET_NOT_FOUND);
3417   bind(SCAN_TO_CHAR_LOOP);
3418   load_unsigned_byte(tmp, Address(result, 0));
3419   cmpl(ch, tmp);
3420   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3421   addptr(result, 1);
3422   subl(cnt1, 1);
3423   jccb(Assembler::zero, RET_NOT_FOUND);
3424   jmp(SCAN_TO_CHAR_LOOP);
3425 
3426   bind(RET_NOT_FOUND);
3427   movl(result, -1);
3428   jmpb(DONE_LABEL);
3429 
       // A vector compare hit: vec3 has all-ones in each matching byte.
3430   bind(FOUND_CHAR);
3431   if (UseAVX >= 2) {
3432     vpmovmskb(tmp, vec3);
3433   } else {
3434     pmovmskb(tmp, vec3);
3435   }
       // tmp has one bit per byte of vec3; its lowest set bit is the offset of
       // the first match inside the vector. ch is reused to hold that offset.
3436   bsfl(ch, tmp);
3437   addptr(result, ch);
3438 
       // Elements are one byte wide, so the byte distance from str1 is the index.
3439   bind(FOUND_SEQ_CHAR);
3440   subptr(result, str1);
3441 
3442   bind(DONE_LABEL);
3443 } // stringL_indexof_char
3444 
3445 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3446   // Size of one array element of the given type; one early return per supported type.
3447   if (eltype == T_BOOLEAN) { return sizeof(jboolean); }
3448   if (eltype == T_BYTE)    { return sizeof(jbyte);    }
3449   if (eltype == T_SHORT)   { return sizeof(jshort);   }
3450   if (eltype == T_CHAR)    { return sizeof(jchar);    }
3451   if (eltype == T_INT)     { return sizeof(jint);     }
3452 
3453   // Every other element type is unsupported here.
3454   ShouldNotReachHere();
3455   return -1;
3456 }
3457 
3458 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3459   // Emits the single load from src into dst that matches eltype, then returns.
3460   // T_BOOLEAN is used as the surrogate for an unsigned byte element.
3461   if (eltype == T_BOOLEAN) { movzbl(dst, src); return; }
3462   if (eltype == T_BYTE)    { movsbl(dst, src); return; }
3463   if (eltype == T_SHORT)   { movswl(dst, src); return; }
3464   if (eltype == T_CHAR)    { movzwl(dst, src); return; }
3465   if (eltype == T_INT)     { movl(dst, src);   return; }
3466 
3467   // No load is defined for any other element type.
3468   ShouldNotReachHere();
3469 }
3470 
3471 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
       // Vector load of eltype elements from the address src into dst, delegated
       // to load_vector. The size passed is 8 times the element size, i.e. room
       // for 8 elements; presumably load_vector expects it in bytes -- TODO
       // confirm against its declaration.
3472   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3473 }
3474 
3475 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
       // Same as the Address overload, but the source is an AddressLiteral. The
       // size passed is 8 times the element size, i.e. room for 8 elements;
       // presumably load_vector expects it in bytes -- TODO confirm against its
       // declaration.
3476   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3477 }
3478 
3479 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3480   // Casts the lanes of dst in place to T_INT. T_BOOLEAN and T_CHAR go through
3481   // the unsigned cast, T_BYTE and T_SHORT through the signed one.
3482   const int enc = Assembler::AVX_256bit;
3483 
3484   if (eltype == T_INT)     { return; }  // lanes are already ints: emit nothing
3485   if (eltype == T_BOOLEAN) { vector_unsigned_cast(dst, dst, enc, T_BYTE, T_INT);  return; }
3486   if (eltype == T_BYTE)    { vector_signed_cast(dst, dst, enc, T_BYTE, T_INT);    return; }
3487   if (eltype == T_SHORT)   { vector_signed_cast(dst, dst, enc, T_SHORT, T_INT);   return; }
3488   if (eltype == T_CHAR)    { vector_unsigned_cast(dst, dst, enc, T_SHORT, T_INT); return; }
3489 
3490   // Every other element type is unsupported here.
3491   ShouldNotReachHere();
3492 }
3493 
3494 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3495                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3496                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3497                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3498                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3499                                         BasicType eltype) {
       // Emits code that folds the cnt1 elements of type eltype starting at ary1
       // into result, using the recurrence result = 31 * result + element. result
       // is read as the running value on entry and holds the updated value on exit.
       //
       // Inputs of 32 or more elements first go through a vector loop that
       // consumes 32 elements per iteration; the remainder (or the whole input
       // when cnt1 < 32) is handled two elements at a time, plus one final single
       // element when the count is odd.
       //
       // The emitted code overwrites ary1 and cnt1 (on the vector path), index,
       // tmp2, tmp3 and the vector registers it is given.
3500   ShortBranchVerifier sbv(this);
3501   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3502   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3503   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3504 
3505   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3506         SHORT_UNROLLED_LOOP_EXIT,
3507         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3508         UNROLLED_VECTOR_LOOP_BEGIN,
3509         END;
3510   switch (eltype) {
3511   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3512   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3513   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3514   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3515   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3516   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3517   }
3518 
3519   // Group the vector registers into arrays so the loops below can index them
3520   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3521                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3522                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3523 
3524   const int elsize = arrays_hashcode_elsize(eltype);
3525 
3526   /*
3527     if (cnt1 >= 2) {
3528       if (cnt1 >= 32) {
3529         UNROLLED VECTOR LOOP
3530       }
3531       UNROLLED SCALAR LOOP
3532     }
3533     SINGLE SCALAR
3534    */
3535 
3536   cmpl(cnt1, 32);
3537   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3538 
3539   // cnt1 >= 32 && generate_vectorized_loop
3540   xorl(index, index);
3541 
3542   // vresult = IntVector.zero(I256);
3543   for (int idx = 0; idx < 4; idx++) {
3544     vpxor(vresult[idx], vresult[idx]);
3545   }
3546   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
       // bound and next are only aliases: they occupy tmp2 and tmp3 until the
       // vector loop has finished.
3547   Register bound = tmp2;
3548   Register next = tmp3;
3549   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3550   movl(next, Address(tmp2, 0));
3551   movdl(vnext, next);
3552   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3553 
3554   // index = 0;
3555   // bound = cnt1 & ~(32 - 1);
3556   movl(bound, cnt1);
3557   andl(bound, ~(32 - 1));
3558   // for (; index < bound; index += 32) {
3559   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3560   // result *= next;
3561   imull(result, next);
3562   // loop fission to upfront the cost of fetching from memory, OOO execution
3563   // can then hopefully do a better job of prefetching
3564   for (int idx = 0; idx < 4; idx++) {
3565     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3566   }
3567   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3568   for (int idx = 0; idx < 4; idx++) {
3569     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3570     arrays_hashcode_elvcast(vtmp[idx], eltype);
3571     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3572   }
3573   // index += 32;
3574   addl(index, 32);
3575   // index < bound;
3576   cmpl(index, bound);
3577   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3578   // }
3579 
       // Step past the elements the vector loop consumed, so the scalar code
       // below sees only the remaining cnt1 elements starting at ary1.
3580   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3581   subl(cnt1, bound);
3582   // release bound
3583 
3584   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3585   for (int idx = 0; idx < 4; idx++) {
3586     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3587     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3588     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3589   }
3590   // result += vresult.reduceLanes(ADD);
3591   for (int idx = 0; idx < 4; idx++) {
3592     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3593   }
3594 
3595   // } else if (cnt1 < 32) {
3596 
3597   bind(SHORT_UNROLLED_BEGIN);
3598   // int i = 1;
3599   movl(index, 1);
3600   cmpl(index, cnt1);
3601   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3602 
3603   // for (; i < cnt1 ; i += 2) {
       // Two elements per iteration:
       //   result = result * 961 + 31 * ary1[i-1] + ary1[i]
       // where 961 == 31 * 31 and 31 * x is formed as (x << 5) - x.
3604   bind(SHORT_UNROLLED_LOOP_BEGIN);
3605   movl(tmp3, 961);
3606   imull(result, tmp3);
3607   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3608   movl(tmp3, tmp2);
3609   shll(tmp3, 5);
3610   subl(tmp3, tmp2);
3611   addl(result, tmp3);
3612   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3613   addl(result, tmp3);
3614   addl(index, 2);
3615   cmpl(index, cnt1);
3616   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3617 
3618   // }
3619   // if (i >= cnt1) {
       // The flags here still come from the most recent cmpl(index, cnt1).
       // index > cnt1 means nothing is left; otherwise index == cnt1 and exactly
       // one element remains: result = 31 * result + ary1[index-1].
3620   bind(SHORT_UNROLLED_LOOP_EXIT);
3621   jccb(Assembler::greater, END);
3622   movl(tmp2, result);
3623   shll(result, 5);
3624   subl(result, tmp2);
3625   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3626   addl(result, tmp3);
3627   // }
3628   bind(END);
3629 
3630   BLOCK_COMMENT("} // arrays_hashcode");
3631 
3632 } // arrays_hashcode
3633 
3634 // helper function for string_compare
3635 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3636                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3637                                            Address::ScaleFactor scale2, Register index, int ae) {
3638   // LL and UU index both strings with the shared scale; any other encoding
3639   // uses scale1 for str1 and scale2 for str2.
3640   const bool same_width = (ae == StrIntrinsicNode::LL) || (ae == StrIntrinsicNode::UU);
3641   Address src1(str1, index, same_width ? scale : scale1, 0);
3642   Address src2(str2, index, same_width ? scale : scale2, 0);
3643   // elem1 is a 16-bit load only for UU; elem2 is an 8-bit load only for LL.
3644   if (ae == StrIntrinsicNode::UU) { load_unsigned_short(elem1, src1); }
3645   else                            { load_unsigned_byte(elem1, src1);  }
3646   if (ae == StrIntrinsicNode::LL) { load_unsigned_byte(elem2, src2);  }
3647   else                            { load_unsigned_short(elem2, src2); }
3648 }
3649 
3650 // Compare strings, used for char[] and byte[].
3651 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3652                                        Register cnt1, Register cnt2, Register result,
3653                                        XMMRegister vec1, int ae, KRegister mask) {
3654   ShortBranchVerifier sbv(this);
3655   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3656   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only AVX3
3657   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3658   int stride2x2 = 0x40;
3659   Address::ScaleFactor scale = Address::no_scale;
3660   Address::ScaleFactor scale1 = Address::no_scale;
3661   Address::ScaleFactor scale2 = Address::no_scale;
3662 
3663   if (ae != StrIntrinsicNode::LL) {
3664     stride2x2 = 0x20;
3665   }
3666 
3667   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3668     shrl(cnt2, 1);
3669   }
3670   // Compute the minimum of the string lengths and the
3671   // difference of the string lengths (stack).
3672   // Do the conditional move stuff
3673   movl(result, cnt1);
3674   subl(cnt1, cnt2);
3675   push(cnt1);
3676   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3677 
3678   // Is the minimum length zero?
3679   testl(cnt2, cnt2);
3680   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3681   if (ae == StrIntrinsicNode::LL) {
3682     // Load first bytes
3683     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3684     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3685   } else if (ae == StrIntrinsicNode::UU) {
3686     // Load first characters
3687     load_unsigned_short(result, Address(str1, 0));
3688     load_unsigned_short(cnt1, Address(str2, 0));
3689   } else {
3690     load_unsigned_byte(result, Address(str1, 0));
3691     load_unsigned_short(cnt1, Address(str2, 0));
3692   }
3693   subl(result, cnt1);
3694   jcc(Assembler::notZero,  POP_LABEL);
3695 
3696   if (ae == StrIntrinsicNode::UU) {
3697     // Divide length by 2 to get number of chars
3698     shrl(cnt2, 1);
3699   }
3700   cmpl(cnt2, 1);
3701   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3702 
3703   // Check if the strings start at the same location and setup scale and stride
3704   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3705     cmpptr(str1, str2);
3706     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3707     if (ae == StrIntrinsicNode::LL) {
3708       scale = Address::times_1;
3709       stride = 16;
3710     } else {
3711       scale = Address::times_2;
3712       stride = 8;
3713     }
3714   } else {
3715     scale1 = Address::times_1;
3716     scale2 = Address::times_2;
3717     // scale not used
3718     stride = 8;
3719   }
3720 
3721   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3722     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3723     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3724     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3725     Label COMPARE_TAIL_LONG;
3726     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only AVX3
3727 
3728     int pcmpmask = 0x19;
3729     if (ae == StrIntrinsicNode::LL) {
3730       pcmpmask &= ~0x01;
3731     }
3732 
3733     // Setup to compare 16-chars (32-bytes) vectors,
3734     // start from first character again because it has aligned address.
3735     if (ae == StrIntrinsicNode::LL) {
3736       stride2 = 32;
3737     } else {
3738       stride2 = 16;
3739     }
3740     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3741       adr_stride = stride << scale;
3742     } else {
3743       adr_stride1 = 8;  //stride << scale1;
3744       adr_stride2 = 16; //stride << scale2;
3745     }
3746 
3747     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3748     // rax and rdx are used by pcmpestri as elements counters
3749     movl(result, cnt2);
3750     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3751     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3752 
3753     // fast path : compare first 2 8-char vectors.
3754     bind(COMPARE_16_CHARS);
3755     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3756       movdqu(vec1, Address(str1, 0));
3757     } else {
3758       pmovzxbw(vec1, Address(str1, 0));
3759     }
3760     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3761     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3762 
3763     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3764       movdqu(vec1, Address(str1, adr_stride));
3765       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3766     } else {
3767       pmovzxbw(vec1, Address(str1, adr_stride1));
3768       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3769     }
3770     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3771     addl(cnt1, stride);
3772 
3773     // Compare the characters at index in cnt1
3774     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3775     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3776     subl(result, cnt2);
3777     jmp(POP_LABEL);
3778 
3779     // Setup the registers to start vector comparison loop
3780     bind(COMPARE_WIDE_VECTORS);
3781     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3782       lea(str1, Address(str1, result, scale));
3783       lea(str2, Address(str2, result, scale));
3784     } else {
3785       lea(str1, Address(str1, result, scale1));
3786       lea(str2, Address(str2, result, scale2));
3787     }
3788     subl(result, stride2);
3789     subl(cnt2, stride2);
3790     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3791     negptr(result);
3792 
3793     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3794     bind(COMPARE_WIDE_VECTORS_LOOP);
3795 
3796     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3797       cmpl(cnt2, stride2x2);
3798       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3799       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3800       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3801 
3802       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3803       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3804         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3805         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3806       } else {
3807         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3808         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3809       }
3810       kortestql(mask, mask);
3811       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3812       addptr(result, stride2x2);  // update since we already compared at this addr
3813       subl(cnt2, stride2x2);      // and sub the size too
3814       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3815 
3816       vpxor(vec1, vec1);
3817       jmpb(COMPARE_WIDE_TAIL);
3818     }//if (VM_Version::supports_avx512vlbw())
3819 
3820     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3821     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3822       vmovdqu(vec1, Address(str1, result, scale));
3823       vpxor(vec1, Address(str2, result, scale));
3824     } else {
3825       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3826       vpxor(vec1, Address(str2, result, scale2));
3827     }
3828     vptest(vec1, vec1);
3829     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3830     addptr(result, stride2);
3831     subl(cnt2, stride2);
3832     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3833     // clean upper bits of YMM registers
3834     vpxor(vec1, vec1);
3835 
3836     // compare wide vectors tail
3837     bind(COMPARE_WIDE_TAIL);
3838     testptr(result, result);
3839     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3840 
3841     movl(result, stride2);
3842     movl(cnt2, result);
3843     negptr(result);
3844     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3845 
3846     // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
3847     bind(VECTOR_NOT_EQUAL);
3848     // clean upper bits of YMM registers
3849     vpxor(vec1, vec1);
3850     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3851       lea(str1, Address(str1, result, scale));
3852       lea(str2, Address(str2, result, scale));
3853     } else {
3854       lea(str1, Address(str1, result, scale1));
3855       lea(str2, Address(str2, result, scale2));
3856     }
3857     jmp(COMPARE_16_CHARS);
3858 
3859     // Compare tail chars, length between 1 to 15 chars
3860     bind(COMPARE_TAIL_LONG);
3861     movl(cnt2, result);
3862     cmpl(cnt2, stride);
3863     jcc(Assembler::less, COMPARE_SMALL_STR);
3864 
3865     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3866       movdqu(vec1, Address(str1, 0));
3867     } else {
3868       pmovzxbw(vec1, Address(str1, 0));
3869     }
3870     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3871     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3872     subptr(cnt2, stride);
3873     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3874     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3875       lea(str1, Address(str1, result, scale));
3876       lea(str2, Address(str2, result, scale));
3877     } else {
3878       lea(str1, Address(str1, result, scale1));
3879       lea(str2, Address(str2, result, scale2));
3880     }
3881     negptr(cnt2);
3882     jmpb(WHILE_HEAD_LABEL);
3883 
3884     bind(COMPARE_SMALL_STR);
3885   } else if (UseSSE42Intrinsics) {
3886     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3887     int pcmpmask = 0x19;
3888     // Setup to compare 8-char (16-byte) vectors,
3889     // start from first character again because it has aligned address.
3890     movl(result, cnt2);
3891     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3892     if (ae == StrIntrinsicNode::LL) {
3893       pcmpmask &= ~0x01;
3894     }
3895     jcc(Assembler::zero, COMPARE_TAIL);
3896     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3897       lea(str1, Address(str1, result, scale));
3898       lea(str2, Address(str2, result, scale));
3899     } else {
3900       lea(str1, Address(str1, result, scale1));
3901       lea(str2, Address(str2, result, scale2));
3902     }
3903     negptr(result);
3904 
3905     // pcmpestri
3906     //   inputs:
3907     //     vec1- substring
3908     //     rax - negative string length (elements count)
3909     //     mem - scanned string
3910     //     rdx - string length (elements count)
3911     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3912     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3913     //   outputs:
3914     //     rcx - first mismatched element index
3915     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3916 
3917     bind(COMPARE_WIDE_VECTORS);
3918     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3919       movdqu(vec1, Address(str1, result, scale));
3920       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3921     } else {
3922       pmovzxbw(vec1, Address(str1, result, scale1));
3923       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3924     }
3925     // After pcmpestri cnt1(rcx) contains mismatched element index
3926 
3927     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3928     addptr(result, stride);
3929     subptr(cnt2, stride);
3930     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3931 
3932     // compare wide vectors tail
3933     testptr(result, result);
3934     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3935 
3936     movl(cnt2, stride);
3937     movl(result, stride);
3938     negptr(result);
3939     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3940       movdqu(vec1, Address(str1, result, scale));
3941       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3942     } else {
3943       pmovzxbw(vec1, Address(str1, result, scale1));
3944       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3945     }
3946     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3947 
3948     // Mismatched characters in the vectors
3949     bind(VECTOR_NOT_EQUAL);
3950     addptr(cnt1, result);
3951     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3952     subl(result, cnt2);
3953     jmpb(POP_LABEL);
3954 
3955     bind(COMPARE_TAIL); // limit is zero
3956     movl(cnt2, result);
3957     // Fallthru to tail compare
3958   }
3959   // Shift str2 and str1 to the end of the arrays, negate min
3960   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3961     lea(str1, Address(str1, cnt2, scale));
3962     lea(str2, Address(str2, cnt2, scale));
3963   } else {
3964     lea(str1, Address(str1, cnt2, scale1));
3965     lea(str2, Address(str2, cnt2, scale2));
3966   }
3967   decrementl(cnt2);  // first character was compared already
3968   negptr(cnt2);
3969 
3970   // Compare the rest of the elements
3971   bind(WHILE_HEAD_LABEL);
3972   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3973   subl(result, cnt1);
3974   jccb(Assembler::notZero, POP_LABEL);
3975   increment(cnt2);
3976   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3977 
3978   // Strings are equal up to min length.  Return the length difference.
3979   bind(LENGTH_DIFF_LABEL);
3980   pop(result);
3981   if (ae == StrIntrinsicNode::UU) {
3982     // Divide diff by 2 to get number of chars
3983     sarl(result, 1);
3984   }
3985   jmpb(DONE_LABEL);
3986 
3987   if (VM_Version::supports_avx512vlbw()) {
3988 
3989     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3990 
3991     kmovql(cnt1, mask);
3992     notq(cnt1);
3993     bsfq(cnt2, cnt1);
3994     if (ae != StrIntrinsicNode::LL) {
3995       // Divide diff by 2 to get number of chars
3996       sarl(cnt2, 1);
3997     }
3998     addq(result, cnt2);
3999     if (ae == StrIntrinsicNode::LL) {
4000       load_unsigned_byte(cnt1, Address(str2, result));
4001       load_unsigned_byte(result, Address(str1, result));
4002     } else if (ae == StrIntrinsicNode::UU) {
4003       load_unsigned_short(cnt1, Address(str2, result, scale));
4004       load_unsigned_short(result, Address(str1, result, scale));
4005     } else {
4006       load_unsigned_short(cnt1, Address(str2, result, scale2));
4007       load_unsigned_byte(result, Address(str1, result, scale1));
4008     }
4009     subl(result, cnt1);
4010     jmpb(POP_LABEL);
4011   }//if (VM_Version::supports_avx512vlbw())
4012 
4013   // Discard the stored length difference
4014   bind(POP_LABEL);
4015   pop(cnt1);
4016 
4017   // That's it
4018   bind(DONE_LABEL);
4019   if(ae == StrIntrinsicNode::UL) {
4020     negl(result);
4021   }
4022 
4023 }
4024 
4025 // Search for Non-ASCII character (Negative byte value) in a byte array,
4026 // return the index of the first such character, otherwise the length
4027 // of the array segment searched.
4028 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4029 //   @IntrinsicCandidate
4030 //   public static int countPositives(byte[] ba, int off, int len) {
4031 //     for (int i = off; i < off + len; i++) {
4032 //       if (ba[i] < 0) {
4033 //         return i - off;
4034 //       }
4035 //     }
4036 //     return len;
4037 //   }
4038 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4039   Register result, Register tmp1,
4040   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4041   // rsi: byte array
4042   // rcx: len
4043   // rax: result
4044   ShortBranchVerifier sbv(this);
4045   assert_different_registers(ary1, len, result, tmp1);
4046   assert_different_registers(vec1, vec2);
4047   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4048 
4049   movl(result, len); // copy
4050   // len == 0
4051   testl(len, len);
4052   jcc(Assembler::zero, DONE);
4053 
4054   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4055     VM_Version::supports_avx512vlbw() &&
4056     VM_Version::supports_bmi2()) {
4057 
4058     Label test_64_loop, test_tail, BREAK_LOOP;
4059     movl(tmp1, len);
4060     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4061 
4062     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4063     andl(len,  0xffffffc0); // vector count (in chars)
4064     jccb(Assembler::zero, test_tail);
4065 
4066     lea(ary1, Address(ary1, len, Address::times_1));
4067     negptr(len);
4068 
4069     bind(test_64_loop);
4070     // Check whether our 64 elements of size byte contain negatives
4071     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4072     kortestql(mask1, mask1);
4073     jcc(Assembler::notZero, BREAK_LOOP);
4074 
4075     addptr(len, 64);
4076     jccb(Assembler::notZero, test_64_loop);
4077 
4078     bind(test_tail);
4079     // bail out when there is nothing to be done
4080     testl(tmp1, -1);
4081     jcc(Assembler::zero, DONE);
4082 
4083 
4084     // check the tail for absense of negatives
4085     // ~(~0 << len) applied up to two times (for 32-bit scenario)
4086     {
4087       Register tmp3_aliased = len;
4088       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4089       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4090       notq(tmp3_aliased);
4091       kmovql(mask2, tmp3_aliased);
4092     }
4093 
4094     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4095     ktestq(mask1, mask2);
4096     jcc(Assembler::zero, DONE);
4097 
4098     // do a full check for negative registers in the tail
4099     movl(len, tmp1); // tmp1 holds low 6-bit from original len;
4100                      // ary1 already pointing to the right place
4101     jmpb(TAIL_START);
4102 
4103     bind(BREAK_LOOP);
4104     // At least one byte in the last 64 byte block was negative.
4105     // Set up to look at the last 64 bytes as if they were a tail
4106     lea(ary1, Address(ary1, len, Address::times_1));
4107     addptr(result, len);
4108     // Ignore the very last byte: if all others are positive,
4109     // it must be negative, so we can skip right to the 2+1 byte
4110     // end comparison at this point
4111     orl(result, 63);
4112     movl(len, 63);
4113     // Fallthru to tail compare
4114   } else {
4115 
4116     if (UseAVX >= 2) {
4117       // With AVX2, use 32-byte vector compare
4118       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4119 
4120       // Compare 32-byte vectors
4121       testl(len, 0xffffffe0);   // vector count (in bytes)
4122       jccb(Assembler::zero, TAIL_START);
4123 
4124       andl(len, 0xffffffe0);
4125       lea(ary1, Address(ary1, len, Address::times_1));
4126       negptr(len);
4127 
4128       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
4129       movdl(vec2, tmp1);
4130       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4131 
4132       bind(COMPARE_WIDE_VECTORS);
4133       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4134       vptest(vec1, vec2);
4135       jccb(Assembler::notZero, BREAK_LOOP);
4136       addptr(len, 32);
4137       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4138 
4139       testl(result, 0x0000001f);   // any bytes remaining?
4140       jcc(Assembler::zero, DONE);
4141 
4142       // Quick test using the already prepared vector mask
4143       movl(len, result);
4144       andl(len, 0x0000001f);
4145       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4146       vptest(vec1, vec2);
4147       jcc(Assembler::zero, DONE);
4148       // There are zeros, jump to the tail to determine exactly where
4149       jmpb(TAIL_START);
4150 
4151       bind(BREAK_LOOP);
4152       // At least one byte in the last 32-byte vector is negative.
4153       // Set up to look at the last 32 bytes as if they were a tail
4154       lea(ary1, Address(ary1, len, Address::times_1));
4155       addptr(result, len);
4156       // Ignore the very last byte: if all others are positive,
4157       // it must be negative, so we can skip right to the 2+1 byte
4158       // end comparison at this point
4159       orl(result, 31);
4160       movl(len, 31);
4161       // Fallthru to tail compare
4162     } else if (UseSSE42Intrinsics) {
4163       // With SSE4.2, use double quad vector compare
4164       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4165 
4166       // Compare 16-byte vectors
4167       testl(len, 0xfffffff0);   // vector count (in bytes)
4168       jcc(Assembler::zero, TAIL_START);
4169 
4170       andl(len, 0xfffffff0);
4171       lea(ary1, Address(ary1, len, Address::times_1));
4172       negptr(len);
4173 
4174       movl(tmp1, 0x80808080);
4175       movdl(vec2, tmp1);
4176       pshufd(vec2, vec2, 0);
4177 
4178       bind(COMPARE_WIDE_VECTORS);
4179       movdqu(vec1, Address(ary1, len, Address::times_1));
4180       ptest(vec1, vec2);
4181       jccb(Assembler::notZero, BREAK_LOOP);
4182       addptr(len, 16);
4183       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4184 
4185       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4186       jcc(Assembler::zero, DONE);
4187 
4188       // Quick test using the already prepared vector mask
4189       movl(len, result);
4190       andl(len, 0x0000000f);   // tail count (in bytes)
4191       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4192       ptest(vec1, vec2);
4193       jcc(Assembler::zero, DONE);
4194       jmpb(TAIL_START);
4195 
4196       bind(BREAK_LOOP);
4197       // At least one byte in the last 16-byte vector is negative.
4198       // Set up and look at the last 16 bytes as if they were a tail
4199       lea(ary1, Address(ary1, len, Address::times_1));
4200       addptr(result, len);
4201       // Ignore the very last byte: if all others are positive,
4202       // it must be negative, so we can skip right to the 2+1 byte
4203       // end comparison at this point
4204       orl(result, 15);
4205       movl(len, 15);
4206       // Fallthru to tail compare
4207     }
4208   }
4209 
4210   bind(TAIL_START);
4211   // Compare 4-byte vectors
4212   andl(len, 0xfffffffc); // vector count (in bytes)
4213   jccb(Assembler::zero, COMPARE_CHAR);
4214 
4215   lea(ary1, Address(ary1, len, Address::times_1));
4216   negptr(len);
4217 
4218   bind(COMPARE_VECTORS);
4219   movl(tmp1, Address(ary1, len, Address::times_1));
4220   andl(tmp1, 0x80808080);
4221   jccb(Assembler::notZero, TAIL_ADJUST);
4222   addptr(len, 4);
4223   jccb(Assembler::notZero, COMPARE_VECTORS);
4224 
4225   // Compare trailing char (final 2-3 bytes), if any
4226   bind(COMPARE_CHAR);
4227 
4228   testl(result, 0x2);   // tail  char
4229   jccb(Assembler::zero, COMPARE_BYTE);
4230   load_unsigned_short(tmp1, Address(ary1, 0));
4231   andl(tmp1, 0x00008080);
4232   jccb(Assembler::notZero, CHAR_ADJUST);
4233   lea(ary1, Address(ary1, 2));
4234 
4235   bind(COMPARE_BYTE);
4236   testl(result, 0x1);   // tail  byte
4237   jccb(Assembler::zero, DONE);
4238   load_unsigned_byte(tmp1, Address(ary1, 0));
4239   testl(tmp1, 0x00000080);
4240   jccb(Assembler::zero, DONE);
4241   subptr(result, 1);
4242   jmpb(DONE);
4243 
4244   bind(TAIL_ADJUST);
4245   // there are negative bits in the last 4 byte block.
4246   // Adjust result and check the next three bytes
4247   addptr(result, len);
4248   orl(result, 3);
4249   lea(ary1, Address(ary1, len, Address::times_1));
4250   jmpb(COMPARE_CHAR);
4251 
4252   bind(CHAR_ADJUST);
4253   // We are looking at a char + optional byte tail, and found that one
4254   // of the bytes in the char is negative. Adjust the result, check the
4255   // first byte and readjust if needed.
4256   andl(result, 0xfffffffc);
4257   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4258   jccb(Assembler::notZero, DONE);
4259   addptr(result, 1);
4260 
4261   // That's it
4262   bind(DONE);
4263   if (UseAVX >= 2) {
4264     // clean upper bits of YMM registers
4265     vpxor(vec1, vec1);
4266     vpxor(vec2, vec2);
4267   }
4268 }
4269 
4270 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4271 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4272                                       Register limit, Register result, Register chr,
4273                                       XMMRegister vec1, XMMRegister vec2, bool is_char,
4274                                       KRegister mask, bool expand_ary2) {
4275   // for expand_ary2, limit is the (smaller) size of the second array.
4276   ShortBranchVerifier sbv(this);
4277   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4278 
4279   assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4280          "Expansion only implemented for AVX2");
4281 
4282   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4283   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4284 
4285   Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4286   int scaleIncr = expand_ary2 ? 8 : 16;
4287 
4288   if (is_array_equ) {
4289     // Check the input args
4290     cmpoop(ary1, ary2);
4291     jcc(Assembler::equal, TRUE_LABEL);
4292 
4293     // Need additional checks for arrays_equals.
4294     testptr(ary1, ary1);
4295     jcc(Assembler::zero, FALSE_LABEL);
4296     testptr(ary2, ary2);
4297     jcc(Assembler::zero, FALSE_LABEL);
4298 
4299     // Check the lengths
4300     movl(limit, Address(ary1, length_offset));
4301     cmpl(limit, Address(ary2, length_offset));
4302     jcc(Assembler::notEqual, FALSE_LABEL);
4303   }
4304 
4305   // count == 0
4306   testl(limit, limit);
4307   jcc(Assembler::zero, TRUE_LABEL);
4308 
4309   if (is_array_equ) {
4310     // Load array address
4311     lea(ary1, Address(ary1, base_offset));
4312     lea(ary2, Address(ary2, base_offset));
4313   }
4314 
4315   if (is_array_equ && is_char) {
4316     // arrays_equals when used for char[].
4317     shll(limit, 1);      // byte count != 0
4318   }
4319   movl(result, limit); // copy
4320 
4321   if (UseAVX >= 2) {
4322     // With AVX2, use 32-byte vector compare
4323     Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4324 
4325     // Compare 32-byte vectors
4326     if (expand_ary2) {
4327       andl(result, 0x0000000f);  //   tail count (in bytes)
4328       andl(limit, 0xfffffff0);   // vector count (in bytes)
4329       jcc(Assembler::zero, COMPARE_TAIL);
4330     } else {
4331       andl(result, 0x0000001f);  //   tail count (in bytes)
4332       andl(limit, 0xffffffe0);   // vector count (in bytes)
4333       jcc(Assembler::zero, COMPARE_TAIL_16);
4334     }
4335 
4336     lea(ary1, Address(ary1, limit, scaleFactor));
4337     lea(ary2, Address(ary2, limit, Address::times_1));
4338     negptr(limit);
4339 
4340     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4341       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4342 
4343       cmpl(limit, -64);
4344       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4345 
4346       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4347 
4348       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4349       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4350       kortestql(mask, mask);
4351       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4352       addptr(limit, 64);  // update since we already compared at this addr
4353       cmpl(limit, -64);
4354       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4355 
4356       // At this point we may still need to compare -limit+result bytes.
4357       // We could execute the next two instruction and just continue via non-wide path:
4358       //  cmpl(limit, 0);
4359       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4360       // But since we stopped at the points ary{1,2}+limit which are
4361       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4362       // (|limit| <= 32 and result < 32),
4363       // we may just compare the last 64 bytes.
4364       //
4365       addptr(result, -64);   // it is safe, bc we just came from this area
4366       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4367       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4368       kortestql(mask, mask);
4369       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4370 
4371       jmp(TRUE_LABEL);
4372 
4373       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4374 
4375     }//if (VM_Version::supports_avx512vlbw())
4376 
4377     bind(COMPARE_WIDE_VECTORS);
4378     vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4379     if (expand_ary2) {
4380       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4381     } else {
4382       vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4383     }
4384     vpxor(vec1, vec2);
4385 
4386     vptest(vec1, vec1);
4387     jcc(Assembler::notZero, FALSE_LABEL);
4388     addptr(limit, scaleIncr * 2);
4389     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4390 
4391     testl(result, result);
4392     jcc(Assembler::zero, TRUE_LABEL);
4393 
4394     vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4395     if (expand_ary2) {
4396       vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4397     } else {
4398       vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4399     }
4400     vpxor(vec1, vec2);
4401 
4402     vptest(vec1, vec1);
4403     jcc(Assembler::notZero, FALSE_LABEL);
4404     jmp(TRUE_LABEL);
4405 
4406     bind(COMPARE_TAIL_16); // limit is zero
4407     movl(limit, result);
4408 
4409     // Compare 16-byte chunks
4410     andl(result, 0x0000000f);  //   tail count (in bytes)
4411     andl(limit, 0xfffffff0);   // vector count (in bytes)
4412     jcc(Assembler::zero, COMPARE_TAIL);
4413 
4414     lea(ary1, Address(ary1, limit, scaleFactor));
4415     lea(ary2, Address(ary2, limit, Address::times_1));
4416     negptr(limit);
4417 
4418     bind(COMPARE_WIDE_VECTORS_16);
4419     movdqu(vec1, Address(ary1, limit, scaleFactor));
4420     if (expand_ary2) {
4421       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4422     } else {
4423       movdqu(vec2, Address(ary2, limit, Address::times_1));
4424     }
4425     pxor(vec1, vec2);
4426 
4427     ptest(vec1, vec1);
4428     jcc(Assembler::notZero, FALSE_LABEL);
4429     addptr(limit, scaleIncr);
4430     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4431 
4432     bind(COMPARE_TAIL); // limit is zero
4433     movl(limit, result);
4434     // Fallthru to tail compare
4435   } else if (UseSSE42Intrinsics) {
4436     // With SSE4.2, use double quad vector compare
4437     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4438 
4439     // Compare 16-byte vectors
4440     andl(result, 0x0000000f);  //   tail count (in bytes)
4441     andl(limit, 0xfffffff0);   // vector count (in bytes)
4442     jcc(Assembler::zero, COMPARE_TAIL);
4443 
4444     lea(ary1, Address(ary1, limit, Address::times_1));
4445     lea(ary2, Address(ary2, limit, Address::times_1));
4446     negptr(limit);
4447 
4448     bind(COMPARE_WIDE_VECTORS);
4449     movdqu(vec1, Address(ary1, limit, Address::times_1));
4450     movdqu(vec2, Address(ary2, limit, Address::times_1));
4451     pxor(vec1, vec2);
4452 
4453     ptest(vec1, vec1);
4454     jcc(Assembler::notZero, FALSE_LABEL);
4455     addptr(limit, 16);
4456     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4457 
4458     testl(result, result);
4459     jcc(Assembler::zero, TRUE_LABEL);
4460 
4461     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4462     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4463     pxor(vec1, vec2);
4464 
4465     ptest(vec1, vec1);
4466     jccb(Assembler::notZero, FALSE_LABEL);
4467     jmpb(TRUE_LABEL);
4468 
4469     bind(COMPARE_TAIL); // limit is zero
4470     movl(limit, result);
4471     // Fallthru to tail compare
4472   }
4473 
4474   // Compare 4-byte vectors
4475   if (expand_ary2) {
4476     testl(result, result);
4477     jccb(Assembler::zero, TRUE_LABEL);
4478   } else {
4479     andl(limit, 0xfffffffc); // vector count (in bytes)
4480     jccb(Assembler::zero, COMPARE_CHAR);
4481   }
4482 
4483   lea(ary1, Address(ary1, limit, scaleFactor));
4484   lea(ary2, Address(ary2, limit, Address::times_1));
4485   negptr(limit);
4486 
4487   bind(COMPARE_VECTORS);
4488   if (expand_ary2) {
4489     // There are no "vector" operations for bytes to shorts
4490     movzbl(chr, Address(ary2, limit, Address::times_1));
4491     cmpw(Address(ary1, limit, Address::times_2), chr);
4492     jccb(Assembler::notEqual, FALSE_LABEL);
4493     addptr(limit, 1);
4494     jcc(Assembler::notZero, COMPARE_VECTORS);
4495     jmp(TRUE_LABEL);
4496   } else {
4497     movl(chr, Address(ary1, limit, Address::times_1));
4498     cmpl(chr, Address(ary2, limit, Address::times_1));
4499     jccb(Assembler::notEqual, FALSE_LABEL);
4500     addptr(limit, 4);
4501     jcc(Assembler::notZero, COMPARE_VECTORS);
4502   }
4503 
4504   // Compare trailing char (final 2 bytes), if any
4505   bind(COMPARE_CHAR);
4506   testl(result, 0x2);   // tail  char
4507   jccb(Assembler::zero, COMPARE_BYTE);
4508   load_unsigned_short(chr, Address(ary1, 0));
4509   load_unsigned_short(limit, Address(ary2, 0));
4510   cmpl(chr, limit);
4511   jccb(Assembler::notEqual, FALSE_LABEL);
4512 
4513   if (is_array_equ && is_char) {
4514     bind(COMPARE_BYTE);
4515   } else {
4516     lea(ary1, Address(ary1, 2));
4517     lea(ary2, Address(ary2, 2));
4518 
4519     bind(COMPARE_BYTE);
4520     testl(result, 0x1);   // tail  byte
4521     jccb(Assembler::zero, TRUE_LABEL);
4522     load_unsigned_byte(chr, Address(ary1, 0));
4523     load_unsigned_byte(limit, Address(ary2, 0));
4524     cmpl(chr, limit);
4525     jccb(Assembler::notEqual, FALSE_LABEL);
4526   }
4527   bind(TRUE_LABEL);
4528   movl(result, 1);   // return true
4529   jmpb(DONE);
4530 
4531   bind(FALSE_LABEL);
4532   xorl(result, result); // return false
4533 
4534   // That's it
4535   bind(DONE);
4536   if (UseAVX >= 2) {
4537     // clean upper bits of YMM registers
4538     vpxor(vec1, vec1);
4539     vpxor(vec2, vec2);
4540   }
4541 }
4542 
4543 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4544 #define __ masm.
4545   Register dst = stub.data<0>();
4546   XMMRegister src = stub.data<1>();
4547   address target = stub.data<2>();
4548   __ bind(stub.entry());
4549   __ subptr(rsp, 8);
4550   __ movdbl(Address(rsp), src);
4551   __ call(RuntimeAddress(target));
4552   // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
4553   __ pop(dst);
4554   __ jmp(stub.continuation());
4555 #undef __
4556 }
4557 
4558 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4559   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4560   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4561 
4562   address slowpath_target;
4563   if (dst_bt == T_INT) {
4564     if (src_bt == T_FLOAT) {
4565       cvttss2sil(dst, src);
4566       cmpl(dst, 0x80000000);
4567       slowpath_target = StubRoutines::x86::f2i_fixup();
4568     } else {
4569       cvttsd2sil(dst, src);
4570       cmpl(dst, 0x80000000);
4571       slowpath_target = StubRoutines::x86::d2i_fixup();
4572     }
4573   } else {
4574     if (src_bt == T_FLOAT) {
4575       cvttss2siq(dst, src);
4576       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4577       slowpath_target = StubRoutines::x86::f2l_fixup();
4578     } else {
4579       cvttsd2siq(dst, src);
4580       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4581       slowpath_target = StubRoutines::x86::d2l_fixup();
4582     }
4583   }
4584 
4585   // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
4586   int max_size = 23 + (UseAPX ? 1 : 0);
4587   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
4588   jcc(Assembler::equal, stub->entry());
4589   bind(stub->continuation());
4590 }
4591 
4592 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4593                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4594   switch(ideal_opc) {
4595     case Op_LShiftVS:
4596       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4597     case Op_LShiftVI:
4598       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4599     case Op_LShiftVL:
4600       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4601     case Op_RShiftVS:
4602       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4603     case Op_RShiftVI:
4604       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4605     case Op_RShiftVL:
4606       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4607     case Op_URShiftVS:
4608       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4609     case Op_URShiftVI:
4610       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4611     case Op_URShiftVL:
4612       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4613     case Op_RotateRightV:
4614       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4615     case Op_RotateLeftV:
4616       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4617     default:
4618       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4619       break;
4620   }
4621 }
4622 
4623 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4624                                                XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4625   if (is_unsigned) {
4626     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4627   } else {
4628     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4629   }
4630 }
4631 
4632 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4633                                                       XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4634   switch (elem_bt) {
4635     case T_BYTE:
4636       if (ideal_opc == Op_SaturatingAddV) {
4637         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4638       } else {
4639         assert(ideal_opc == Op_SaturatingSubV, "");
4640         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4641       }
4642       break;
4643     case T_SHORT:
4644       if (ideal_opc == Op_SaturatingAddV) {
4645         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4646       } else {
4647         assert(ideal_opc == Op_SaturatingSubV, "");
4648         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4649       }
4650       break;
4651     default:
4652       fatal("Unsupported type %s", type2name(elem_bt));
4653       break;
4654   }
4655 }
4656 
4657 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4658                                                         XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4659   switch (elem_bt) {
4660     case T_BYTE:
4661       if (ideal_opc == Op_SaturatingAddV) {
4662         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4663       } else {
4664         assert(ideal_opc == Op_SaturatingSubV, "");
4665         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4666       }
4667       break;
4668     case T_SHORT:
4669       if (ideal_opc == Op_SaturatingAddV) {
4670         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4671       } else {
4672         assert(ideal_opc == Op_SaturatingSubV, "");
4673         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4674       }
4675       break;
4676     default:
4677       fatal("Unsupported type %s", type2name(elem_bt));
4678       break;
4679   }
4680 }
4681 
4682 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4683                                                Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4684   if (is_unsigned) {
4685     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4686   } else {
4687     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4688   }
4689 }
4690 
4691 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4692                                                       XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4693   switch (elem_bt) {
4694     case T_BYTE:
4695       if (ideal_opc == Op_SaturatingAddV) {
4696         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4697       } else {
4698         assert(ideal_opc == Op_SaturatingSubV, "");
4699         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4700       }
4701       break;
4702     case T_SHORT:
4703       if (ideal_opc == Op_SaturatingAddV) {
4704         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4705       } else {
4706         assert(ideal_opc == Op_SaturatingSubV, "");
4707         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4708       }
4709       break;
4710     default:
4711       fatal("Unsupported type %s", type2name(elem_bt));
4712       break;
4713   }
4714 }
4715 
4716 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4717                                                         XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4718   switch (elem_bt) {
4719     case T_BYTE:
4720       if (ideal_opc == Op_SaturatingAddV) {
4721         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4722       } else {
4723         assert(ideal_opc == Op_SaturatingSubV, "");
4724         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4725       }
4726       break;
4727     case T_SHORT:
4728       if (ideal_opc == Op_SaturatingAddV) {
4729         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4730       } else {
4731         assert(ideal_opc == Op_SaturatingSubV, "");
4732         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4733       }
4734       break;
4735     default:
4736       fatal("Unsupported type %s", type2name(elem_bt));
4737       break;
4738   }
4739 }
4740 
4741 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4742                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4743                                     bool is_varshift) {
4744   switch (ideal_opc) {
4745     case Op_AddVB:
4746       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4747     case Op_AddVS:
4748       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4749     case Op_AddVI:
4750       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4751     case Op_AddVL:
4752       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4753     case Op_AddVF:
4754       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4755     case Op_AddVD:
4756       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4757     case Op_SubVB:
4758       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4759     case Op_SubVS:
4760       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4761     case Op_SubVI:
4762       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4763     case Op_SubVL:
4764       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4765     case Op_SubVF:
4766       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4767     case Op_SubVD:
4768       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4769     case Op_MulVS:
4770       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4771     case Op_MulVI:
4772       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4773     case Op_MulVL:
4774       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4775     case Op_MulVF:
4776       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4777     case Op_MulVD:
4778       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4779     case Op_DivVF:
4780       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4781     case Op_DivVD:
4782       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4783     case Op_SqrtVF:
4784       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4785     case Op_SqrtVD:
4786       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4787     case Op_AbsVB:
4788       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4789     case Op_AbsVS:
4790       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4791     case Op_AbsVI:
4792       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4793     case Op_AbsVL:
4794       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4795     case Op_FmaVF:
4796       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4797     case Op_FmaVD:
4798       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4799     case Op_VectorRearrange:
4800       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4801     case Op_LShiftVS:
4802       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4803     case Op_LShiftVI:
4804       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4805     case Op_LShiftVL:
4806       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4807     case Op_RShiftVS:
4808       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4809     case Op_RShiftVI:
4810       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4811     case Op_RShiftVL:
4812       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4813     case Op_URShiftVS:
4814       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4815     case Op_URShiftVI:
4816       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4817     case Op_URShiftVL:
4818       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4819     case Op_RotateLeftV:
4820       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4821     case Op_RotateRightV:
4822       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4823     case Op_MaxV:
4824       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4825     case Op_MinV:
4826       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4827     case Op_UMinV:
4828       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4829     case Op_UMaxV:
4830       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4831     case Op_XorV:
4832       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4833     case Op_OrV:
4834       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4835     case Op_AndV:
4836       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4837     default:
4838       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4839       break;
4840   }
4841 }
4842 
4843 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4844                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4845   switch (ideal_opc) {
4846     case Op_AddVB:
4847       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4848     case Op_AddVS:
4849       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4850     case Op_AddVI:
4851       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4852     case Op_AddVL:
4853       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4854     case Op_AddVF:
4855       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4856     case Op_AddVD:
4857       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4858     case Op_SubVB:
4859       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4860     case Op_SubVS:
4861       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4862     case Op_SubVI:
4863       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4864     case Op_SubVL:
4865       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4866     case Op_SubVF:
4867       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4868     case Op_SubVD:
4869       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4870     case Op_MulVS:
4871       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4872     case Op_MulVI:
4873       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4874     case Op_MulVL:
4875       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4876     case Op_MulVF:
4877       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4878     case Op_MulVD:
4879       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4880     case Op_DivVF:
4881       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4882     case Op_DivVD:
4883       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4884     case Op_FmaVF:
4885       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4886     case Op_FmaVD:
4887       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4888     case Op_MaxV:
4889       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4890     case Op_MinV:
4891       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4892     case Op_UMaxV:
4893       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4894     case Op_UMinV:
4895       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4896     case Op_XorV:
4897       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4898     case Op_OrV:
4899       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4900     case Op_AndV:
4901       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4902     default:
4903       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4904       break;
4905   }
4906 }
4907 
4908 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4909                                   KRegister src1, KRegister src2) {
4910   BasicType etype = T_ILLEGAL;
4911   switch(mask_len) {
4912     case 2:
4913     case 4:
4914     case 8:  etype = T_BYTE; break;
4915     case 16: etype = T_SHORT; break;
4916     case 32: etype = T_INT; break;
4917     case 64: etype = T_LONG; break;
4918     default: fatal("Unsupported type"); break;
4919   }
4920   assert(etype != T_ILLEGAL, "");
4921   switch(ideal_opc) {
4922     case Op_AndVMask:
4923       kand(etype, dst, src1, src2); break;
4924     case Op_OrVMask:
4925       kor(etype, dst, src1, src2); break;
4926     case Op_XorVMask:
4927       kxor(etype, dst, src1, src2); break;
4928     default:
4929       fatal("Unsupported masked operation"); break;
4930   }
4931 }
4932 
/*
 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
 * If src is NaN, the result is 0.
 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
 * the result is equal to the value of Integer.MIN_VALUE.
 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
 * the result is equal to the value of Integer.MAX_VALUE.
 *
 * Callers in this file invoke it right after the float->int conversion, so on entry
 * dst holds the converted lanes and src the original float lanes. dst is patched in
 * place. xtmp1-xtmp4 are scratch registers and are overwritten.
 */
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                   XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
                                                                   Register rscratch, AddressLiteral float_sign_flip,
                                                                   int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // xtmp2 = mask of dst lanes equal to the float_sign_flip constant; these are
  // the lanes that may need fixing up.
  vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
  // Fast path: no lane matched, nothing to patch.
  vptest(xtmp2, xtmp2, vec_enc);
  jccb(Assembler::equal, done);

  // xtmp1 = ~float_sign_flip (xor with all-ones), the value written to
  // positive special lanes below.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
  vpxor(xtmp1, xtmp1, xtmp4, vec_enc);

  // Set destination lanes whose source lane is unordered (NaN) to zero.
  vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
  vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);

  // Recompute the mask for remaining special value.
  vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
  // Extract SRC values corresponding to TRUE mask lanes.
  vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
  // values are set.
  vpxor(xtmp3, xtmp2, xtmp4, vec_enc);

  // Replace the positive special lanes with the value in xtmp1.
  vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
  bind(done);
}
4970 
// EVEX (opmask based) fix-up of float->int conversion results.
// Callers in this file invoke it right after the conversion, so on entry dst
// holds the converted int lanes and src the original float lanes. dst is
// patched in place; xtmp1, xtmp2, ktmp1 and ktmp2 are scratch.
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                    XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                    Register rscratch, AddressLiteral float_sign_flip,
                                                                    int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // ktmp1 = dst lanes equal to the float_sign_flip constant, i.e. the lanes
  // that may need fixing up.
  evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  // Fast path: no lane matched, nothing to patch.
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Lanes whose source is unordered with itself (NaN) are set to zero.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // Remove the NaN lanes from the fix-up mask, then keep only the lanes whose
  // source is not less than zero (xtmp2 is still all zeroes here).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  // Truth table 0x11 is ~(B | C); with both inputs xtmp1 this is ~xtmp1.
  vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  // Write that value into the selected lanes only.
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
4992 
// EVEX (opmask based) fix-up of float->long conversion results.
// The caller in this file invokes it right after evcvttps2qq, so on entry dst
// holds the converted long lanes and src the original float lanes. dst is
// patched in place; xtmp1, xtmp2, ktmp1 and ktmp2 are scratch.
void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral double_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // ktmp1 = quadword lanes of dst equal to the double_sign_flip constant,
  // i.e. the lanes that may need fixing up.
  evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
  // Fast path: no lane matched, nothing to patch.
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Lanes whose float source is unordered with itself (NaN) are set to zero.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // Remove the NaN lanes from the fix-up mask, then keep only the lanes whose
  // source is not less than zero (xtmp2 is still all zeroes here).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  // Truth table 0x11 is ~(B | C); with both inputs xtmp1 this is ~xtmp1.
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  // Write that value into the selected quadword lanes only.
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5015 
// EVEX (opmask based) fix-up of double->int conversion results.
// The caller in this file invokes it right after vcvttpd2dq, so on entry dst
// holds the converted int lanes and src the original double lanes. dst is
// patched in place; xtmp1, xtmp2, ktmp1 and ktmp2 are scratch.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral float_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // ktmp1 = doubleword lanes of dst equal to the float_sign_flip constant,
  // i.e. the lanes that may need fixing up.
  // NOTE(review): the constant is loaded with the quadword move while it is
  // compared per doubleword; presumably harmless for an unmasked load - confirm.
  evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  // Fast path: no lane matched, nothing to patch.
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Lanes whose double source is unordered with itself (NaN) are set to zero.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // Remove the NaN lanes from the fix-up mask, then keep only the lanes whose
  // source is not less than zero (xtmp2 is still all zeroes here).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  // Truth table 0x11 is ~(B | C); with both inputs xtmp1 this is ~xtmp1.
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  // Write that value into the selected doubleword lanes only.
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5037 
/*
 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
 * If src is NaN, the result is 0.
 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
 * the result is equal to the value of Long.MIN_VALUE.
 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
 * the result is equal to the value of Long.MAX_VALUE.
 *
 * Callers in this file invoke it right after the double->long conversion, so on entry
 * dst holds the converted lanes and src the original double lanes. dst is patched in
 * place; xtmp1, xtmp2, ktmp1 and ktmp2 are scratch.
 */
void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                      Register rscratch, AddressLiteral double_sign_flip,
                                                                      int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // ktmp1 = quadword lanes of dst equal to the double_sign_flip constant,
  // i.e. the lanes that may need fixing up.
  // NOTE(review): the float->long sibling loads the constant with evmovdquq and
  // calls Assembler::evpcmpeqq with an explicit k0; presumably equivalent - confirm.
  evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
  // Fast path: no lane matched, nothing to patch.
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Lanes whose source is unordered with itself (NaN) are set to zero.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // Remove the NaN lanes from the fix-up mask, then keep only the lanes whose
  // source is not less than zero (xtmp2 is still all zeroes here).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  // Truth table 0x11 is ~(B | C); with both inputs xtmp1 this is ~xtmp1.
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  // Write that value into the selected quadword lanes only.
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5068 
5069 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5070                                                              XMMRegister xtmp, int index, int vec_enc) {
5071    assert(vec_enc < Assembler::AVX_512bit, "");
5072    if (vec_enc == Assembler::AVX_256bit) {
5073      vextractf128_high(xtmp, src);
5074      vshufps(dst, src, xtmp, index, vec_enc);
5075    } else {
5076      vshufps(dst, src, zero, index, vec_enc);
5077    }
5078 }
5079 
// AVX fix-up of double->int conversion results.
// The caller in this file invokes it right after vcvttpd2dq, so on entry dst
// holds the converted int lanes (handled as a 128-bit vector here) and src the
// original double lanes, whose width is given by src_vec_enc. dst is patched in
// place; xtmp1-xtmp5 are scratch registers.
// Note that the early exit below is taken before xtmp4 is written, so xtmp4 is
// only known to be zero when the slow path has run.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                                    XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
                                                                    AddressLiteral float_sign_flip, int src_vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");

  Label done;
  // Compare the destination lanes with float_sign_flip
  // value to get mask for all special values.
  movdqu(xtmp1, float_sign_flip, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
  // Fast path: no destination lane holds the special value.
  ptest(xtmp2, xtmp2);
  jccb(Assembler::equal, done);

  // Flip float_sign_flip to get max integer value.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
  pxor(xtmp1, xtmp4);

  // Set destination lanes corresponding to unordered source lanes as zero.
  vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
  vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);

  // Shuffle mask vector and pack lower doubleword from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);

  // Recompute the mask for remaining special value.
  pxor(xtmp2, xtmp3);
  // Extract mask corresponding to non-negative source lanes.
  vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);

  // Shuffle mask vector and pack lower doubleword from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  pand(xtmp3, xtmp2);

  // Replace destination lanes holding special value(0x80000000) with max int
  // if corresponding source lane holds a +ve value.
  vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
  bind(done);
}
5119 
5120 
5121 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5122                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5123   switch(to_elem_bt) {
5124     case T_SHORT:
5125       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5126       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5127       vpackusdw(dst, dst, zero, vec_enc);
5128       if (vec_enc == Assembler::AVX_256bit) {
5129         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5130       }
5131       break;
5132     case  T_BYTE:
5133       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5134       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5135       vpackusdw(dst, dst, zero, vec_enc);
5136       if (vec_enc == Assembler::AVX_256bit) {
5137         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5138       }
5139       vpackuswb(dst, dst, zero, vec_enc);
5140       break;
5141     default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt));
5142   }
5143 }
5144 
5145 /*
5146  * Algorithm for vector D2L and F2I conversions (AVX 10.2 unsupported):-
5147  * a) Perform vector D2L/F2I cast.
 * b) Take the fast path if no result vector lane contains the value 0x80000000.
 *    A lane holding that value signifies that the source value could be any of
 *    the special floating point values(NaN,-Inf,Inf,Max,-Min).
5151  * c) Set destination to zero if source is NaN value.
5152  * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5153  */
5154 
5155 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5156                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5157                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5158   int to_elem_sz = type2aelembytes(to_elem_bt);
5159   assert(to_elem_sz <= 4, "");
5160   vcvttps2dq(dst, src, vec_enc);
5161   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5162   if (to_elem_sz < 4) {
5163     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5164     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5165   }
5166 }
5167 
5168 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5169                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5170                                             Register rscratch, int vec_enc) {
5171   int to_elem_sz = type2aelembytes(to_elem_bt);
5172   assert(to_elem_sz <= 4, "");
5173   vcvttps2dq(dst, src, vec_enc);
5174   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5175   switch(to_elem_bt) {
5176     case T_INT:
5177       break;
5178     case T_SHORT:
5179       evpmovdw(dst, dst, vec_enc);
5180       break;
5181     case T_BYTE:
5182       evpmovdb(dst, dst, vec_enc);
5183       break;
5184     default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt));
5185   }
5186 }
5187 
// EVEX float -> long vector cast: converts with evcvttps2qq, then patches the
// special lanes in dst. xtmp1, xtmp2, ktmp1, ktmp2 and rscratch are scratch
// resources handed on to the special-case routine.
void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                            KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
                                            Register rscratch, int vec_enc) {
  evcvttps2qq(dst, src, vec_enc);
  vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
}
5194 
5195 // Handling for downcasting from double to integer or sub-word types on AVX2.
5196 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5197                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5198                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5199   int to_elem_sz = type2aelembytes(to_elem_bt);
5200   assert(to_elem_sz < 8, "");
5201   vcvttpd2dq(dst, src, vec_enc);
5202   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5203                                               float_sign_flip, vec_enc);
5204   if (to_elem_sz < 4) {
5205     // xtmp4 holds all zero lanes.
5206     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5207   }
5208 }
5209 
5210 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5211                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5212                                             KRegister ktmp2, AddressLiteral sign_flip,
5213                                             Register rscratch, int vec_enc) {
5214   if (VM_Version::supports_avx512dq()) {
5215     evcvttpd2qq(dst, src, vec_enc);
5216     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5217     switch(to_elem_bt) {
5218       case T_LONG:
5219         break;
5220       case T_INT:
5221         evpmovsqd(dst, dst, vec_enc);
5222         break;
5223       case T_SHORT:
5224         evpmovsqd(dst, dst, vec_enc);
5225         evpmovdw(dst, dst, vec_enc);
5226         break;
5227       case T_BYTE:
5228         evpmovsqd(dst, dst, vec_enc);
5229         evpmovdb(dst, dst, vec_enc);
5230         break;
5231       default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt));
5232     }
5233   } else {
5234     assert(type2aelembytes(to_elem_bt) <= 4, "");
5235     vcvttpd2dq(dst, src, vec_enc);
5236     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5237     switch(to_elem_bt) {
5238       case T_INT:
5239         break;
5240       case T_SHORT:
5241         evpmovdw(dst, dst, vec_enc);
5242         break;
5243       case T_BYTE:
5244         evpmovdb(dst, dst, vec_enc);
5245         break;
5246       default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt));
5247     }
5248   }
5249 }
5250 
5251 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5252   switch(to_elem_bt) {
5253     case T_LONG:
5254       evcvttps2qqs(dst, src, vec_enc);
5255       break;
5256     case T_INT:
5257       evcvttps2dqs(dst, src, vec_enc);
5258       break;
5259     case T_SHORT:
5260       evcvttps2dqs(dst, src, vec_enc);
5261       evpmovdw(dst, dst, vec_enc);
5262       break;
5263     case T_BYTE:
5264       evcvttps2dqs(dst, src, vec_enc);
5265       evpmovdb(dst, dst, vec_enc);
5266       break;
5267     default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt));
5268   }
5269 }
5270 
5271 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5272   switch(to_elem_bt) {
5273     case T_LONG:
5274       evcvttps2qqs(dst, src, vec_enc);
5275       break;
5276     case T_INT:
5277       evcvttps2dqs(dst, src, vec_enc);
5278       break;
5279     case T_SHORT:
5280       evcvttps2dqs(dst, src, vec_enc);
5281       evpmovdw(dst, dst, vec_enc);
5282       break;
5283     case T_BYTE:
5284       evcvttps2dqs(dst, src, vec_enc);
5285       evpmovdb(dst, dst, vec_enc);
5286       break;
5287     default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt));
5288   }
5289 }
5290 
5291 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5292   switch(to_elem_bt) {
5293     case T_LONG:
5294       evcvttpd2qqs(dst, src, vec_enc);
5295       break;
5296     case T_INT:
5297       evcvttpd2dqs(dst, src, vec_enc);
5298       break;
5299     case T_SHORT:
5300       evcvttpd2dqs(dst, src, vec_enc);
5301       evpmovdw(dst, dst, vec_enc);
5302       break;
5303     case T_BYTE:
5304       evcvttpd2dqs(dst, src, vec_enc);
5305       evpmovdb(dst, dst, vec_enc);
5306       break;
5307     default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (reg src): %s", type2name(to_elem_bt));
5308   }
5309 }
5310 
5311 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5312   switch(to_elem_bt) {
5313     case T_LONG:
5314       evcvttpd2qqs(dst, src, vec_enc);
5315       break;
5316     case T_INT:
5317       evcvttpd2dqs(dst, src, vec_enc);
5318       break;
5319     case T_SHORT:
5320       evcvttpd2dqs(dst, src, vec_enc);
5321       evpmovdw(dst, dst, vec_enc);
5322       break;
5323     case T_BYTE:
5324       evcvttpd2dqs(dst, src, vec_enc);
5325       evpmovdb(dst, dst, vec_enc);
5326       break;
5327     default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt));
5328   }
5329 }
5330 
5331 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5332                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5333                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5334   // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
5335   // and re-instantiate original MXCSR.RC mode after that.
5336   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5337 
5338   mov64(tmp, julong_cast(0.5L));
5339   evpbroadcastq(xtmp1, tmp, vec_enc);
5340   vaddpd(xtmp1, src , xtmp1, vec_enc);
5341   evcvtpd2qq(dst, xtmp1, vec_enc);
5342   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5343                                                 double_sign_flip, vec_enc);;
5344 
5345   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5346 }
5347 
// Rounds float lanes of src to int lanes in dst (EVEX).
//   float_sign_flip - constant used by the special-case fix-up
//   new_mxcsr       - MXCSR value loaded for the duration of the rounding
//   tmp             - scratch GPR, also used as rscratch for address materialization
//   xtmp1, xtmp2, ktmp1, ktmp2 - scratch vector / opmask registers
void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Broadcast the constant 0.5 (as produced by jint_cast) to every lane, add it
  // to the source and convert using the current rounding mode.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  vcvtps2dq(dst, xtmp1, vec_enc);
  // Patch the special lanes using the original source values.
  vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                              float_sign_flip, vec_enc);

  // Switch back to the standard MXCSR value.
  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5365 
// Rounds float lanes of src to int lanes in dst (AVX).
//   float_sign_flip - constant used by the special-case fix-up
//   new_mxcsr       - MXCSR value loaded for the duration of the rounding
//   tmp             - scratch GPR, also used as rscratch for address materialization
//   xtmp1-xtmp4     - scratch vector registers
void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
                                               AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                               Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Broadcast the constant 0.5 (as produced by jint_cast) to every lane, add it
  // to the source and convert using the current rounding mode.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  vcvtps2dq(dst, xtmp1, vec_enc);
  // Patch the special lanes using the original source values.
  vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);

  // Switch back to the standard MXCSR value.
  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5382 
5383 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5384                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5385   switch (from_elem_bt) {
5386     case T_BYTE:
5387       switch (to_elem_bt) {
5388         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5389         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5390         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5391         default: ShouldNotReachHere();
5392       }
5393       break;
5394     case T_SHORT:
5395       switch (to_elem_bt) {
5396         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5397         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5398         default: ShouldNotReachHere();
5399       }
5400       break;
5401     case T_INT:
5402       assert(to_elem_bt == T_LONG, "");
5403       vpmovzxdq(dst, src, vlen_enc);
5404       break;
5405     default:
5406       ShouldNotReachHere();
5407   }
5408 }
5409 
5410 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5411                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5412   switch (from_elem_bt) {
5413     case T_BYTE:
5414       switch (to_elem_bt) {
5415         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5416         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5417         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5418         default: ShouldNotReachHere();
5419       }
5420       break;
5421     case T_SHORT:
5422       switch (to_elem_bt) {
5423         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5424         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5425         default: ShouldNotReachHere();
5426       }
5427       break;
5428     case T_INT:
5429       assert(to_elem_bt == T_LONG, "");
5430       vpmovsxdq(dst, src, vlen_enc);
5431       break;
5432     default:
5433       ShouldNotReachHere();
5434   }
5435 }
5436 
5437 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5438                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5439   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5440   assert(vlen_enc != AVX_512bit, "");
5441 
5442   int dst_bt_size = type2aelembytes(dst_bt);
5443   int src_bt_size = type2aelembytes(src_bt);
5444   if (dst_bt_size > src_bt_size) {
5445     switch (dst_bt_size / src_bt_size) {
5446       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5447       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5448       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5449       default: ShouldNotReachHere();
5450     }
5451   } else {
5452     assert(dst_bt_size < src_bt_size, "");
5453     switch (src_bt_size / dst_bt_size) {
5454       case 2: {
5455         if (vlen_enc == AVX_128bit) {
5456           vpacksswb(dst, src, src, vlen_enc);
5457         } else {
5458           vpacksswb(dst, src, src, vlen_enc);
5459           vpermq(dst, dst, 0x08, vlen_enc);
5460         }
5461         break;
5462       }
5463       case 4: {
5464         if (vlen_enc == AVX_128bit) {
5465           vpackssdw(dst, src, src, vlen_enc);
5466           vpacksswb(dst, dst, dst, vlen_enc);
5467         } else {
5468           vpackssdw(dst, src, src, vlen_enc);
5469           vpermq(dst, dst, 0x08, vlen_enc);
5470           vpacksswb(dst, dst, dst, AVX_128bit);
5471         }
5472         break;
5473       }
5474       case 8: {
5475         if (vlen_enc == AVX_128bit) {
5476           vpshufd(dst, src, 0x08, vlen_enc);
5477           vpackssdw(dst, dst, dst, vlen_enc);
5478           vpacksswb(dst, dst, dst, vlen_enc);
5479         } else {
5480           vpshufd(dst, src, 0x08, vlen_enc);
5481           vpermq(dst, dst, 0x08, vlen_enc);
5482           vpackssdw(dst, dst, dst, AVX_128bit);
5483           vpacksswb(dst, dst, dst, AVX_128bit);
5484         }
5485         break;
5486       }
5487       default: ShouldNotReachHere();
5488     }
5489   }
5490 }
5491 
5492 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5493                                    bool merge, BasicType bt, int vlen_enc) {
5494   if (bt == T_INT) {
5495     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5496   } else {
5497     assert(bt == T_LONG, "");
5498     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5499   }
5500 }
5501 
5502 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5503                                    bool merge, BasicType bt, int vlen_enc) {
5504   if (bt == T_INT) {
5505     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5506   } else {
5507     assert(bt == T_LONG, "");
5508     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5509   }
5510 }
5511 
// Expands the low mask_len bits of the GPR src into a byte vector in dst: every mask
// bit becomes one byte whose least significant bit is that mask bit. The mask is
// processed in chunks of 8 bits (one 64-bit group of 8 bytes per chunk).
//   rtmp1 - scratch; holds the byte-expanded form of the current 8-bit chunk.
//   rtmp2 - scratch; holds the not yet consumed part of src (only used if mask_len > 8).
//   xtmp  - scratch; collects two 64-bit chunks, i.e. one 128-bit lane.
//   mask_len - number of mask bits; must be a multiple of 8 when larger than 8.
void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
                                               Register rtmp2, XMMRegister xtmp, int mask_len,
                                               int vec_enc) {
  int index = 0;   // number of the 64-bit chunk being produced (chunk 0 is done before the loop)
  int vindex = 0;  // 128-bit lane of dst that the current chunk belongs to
  // Deposit the low 8 bits of src into bit 0 of each of the 8 bytes of rtmp1.
  mov64(rtmp1, 0x0101010101010101L);
  pdepq(rtmp1, src, rtmp1);
  if (mask_len > 8) {
    // More chunks follow: keep a shiftable copy of the mask and seed the lane
    // accumulator with chunk 0.
    movq(rtmp2, src);
    vpxor(xtmp, xtmp, xtmp, vec_enc);
    movq(xtmp, rtmp1);
  }
  movq(dst, rtmp1);

  mask_len -= 8;
  while (mask_len > 0) {
    assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
    index++;
    if ((index % 2) == 0) {
      // Starting a new 128-bit lane: clear the accumulator.
      pxor(xtmp, xtmp);
    }
    // Expand the next 8 mask bits and place them in the low or high half of xtmp.
    mov64(rtmp1, 0x0101010101010101L);
    shrq(rtmp2, 8);
    pdepq(rtmp1, rtmp2, rtmp1);
    pinsrq(xtmp, rtmp1, index % 2);
    vindex = index / 2;
    if (vindex) {
      // Write entire 16 byte vector when both 64 bit
      // lanes are updated to save redundant instructions.
      if (index % 2) {
        vinsertf128(dst, dst, xtmp, vindex);
      }
    } else {
      // Lane 0: xtmp already contains chunk 0 and chunk 1.
      vmovdqu(dst, xtmp);
    }
    mask_len -= 8;
  }
}
5550 
5551 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5552   switch(opc) {
5553     case Op_VectorMaskTrueCount:
5554       popcntq(dst, tmp);
5555       break;
5556     case Op_VectorMaskLastTrue:
5557       if (VM_Version::supports_lzcnt()) {
5558         lzcntq(tmp, tmp);
5559         movl(dst, 63);
5560         subl(dst, tmp);
5561       } else {
5562         movl(dst, -1);
5563         bsrq(tmp, tmp);
5564         cmov32(Assembler::notZero, dst, tmp);
5565       }
5566       break;
5567     case Op_VectorMaskFirstTrue:
5568       if (UseCountTrailingZerosInstruction) {
5569         if (masklen < 32) {
5570           orl(tmp, 1 << masklen);
5571           tzcntl(dst, tmp);
5572         } else if (masklen == 32) {
5573           tzcntl(dst, tmp);
5574         } else {
5575           assert(masklen == 64, "");
5576           tzcntq(dst, tmp);
5577         }
5578       } else {
5579         if (masklen < 32) {
5580           orl(tmp, 1 << masklen);
5581           bsfl(dst, tmp);
5582         } else {
5583           assert(masklen == 32 || masklen == 64, "");
5584           movl(dst, masklen);
5585           if (masklen == 32)  {
5586             bsfl(tmp, tmp);
5587           } else {
5588             bsfq(tmp, tmp);
5589           }
5590           cmov32(Assembler::notZero, dst, tmp);
5591         }
5592       }
5593       break;
5594     case Op_VectorMaskToLong:
5595       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5596       break;
5597     default: assert(false, "Unhandled mask operation");
5598   }
5599 }
5600 
5601 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5602                                               int masklen, int masksize, int vec_enc) {
5603   assert(VM_Version::supports_popcnt(), "");
5604 
5605   if(VM_Version::supports_avx512bw()) {
5606     kmovql(tmp, mask);
5607   } else {
5608     assert(masklen <= 16, "");
5609     kmovwl(tmp, mask);
5610   }
5611 
5612   // Mask generated out of partial vector comparisons/replicate/mask manipulation
5613   // operations needs to be clipped.
5614   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5615     andq(tmp, (1 << masklen) - 1);
5616   }
5617 
5618   vector_mask_operation_helper(opc, dst, tmp, masklen);
5619 }
5620 
5621 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5622                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5623   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5624          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5625   assert(VM_Version::supports_popcnt(), "");
5626 
5627   bool need_clip = false;
5628   switch(bt) {
5629     case T_BOOLEAN:
5630       // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1
5631       vpxor(xtmp, xtmp, xtmp, vec_enc);
5632       vpsubb(xtmp, xtmp, mask, vec_enc);
5633       vpmovmskb(tmp, xtmp, vec_enc);
5634       need_clip = masklen < 16;
5635       break;
5636     case T_BYTE:
5637       vpmovmskb(tmp, mask, vec_enc);
5638       need_clip = masklen < 16;
5639       break;
5640     case T_SHORT:
5641       vpacksswb(xtmp, mask, mask, vec_enc);
5642       if (masklen >= 16) {
5643         vpermpd(xtmp, xtmp, 8, vec_enc);
5644       }
5645       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5646       need_clip = masklen < 16;
5647       break;
5648     case T_INT:
5649     case T_FLOAT:
5650       vmovmskps(tmp, mask, vec_enc);
5651       need_clip = masklen < 4;
5652       break;
5653     case T_LONG:
5654     case T_DOUBLE:
5655       vmovmskpd(tmp, mask, vec_enc);
5656       need_clip = masklen < 2;
5657       break;
5658     default: assert(false, "Unhandled type, %s", type2name(bt));
5659   }
5660 
5661   // Mask generated out of partial vector comparisons/replicate/mask manipulation
5662   // operations needs to be clipped.
5663   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5664     // need_clip implies masklen < 32
5665     andq(tmp, (1 << masklen) - 1);
5666   }
5667 
5668   vector_mask_operation_helper(opc, dst, tmp, masklen);
5669 }
5670 
5671 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5672                                              Register rtmp2, int mask_len) {
5673   kmov(rtmp1, src);
5674   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5675   mov64(rtmp2, -1L);
5676   pextq(rtmp2, rtmp2, rtmp1);
5677   kmov(dst, rtmp2);
5678 }
5679 
5680 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5681                                                     XMMRegister mask, Register rtmp, Register rscratch,
5682                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5683                                                     int vec_enc) {
5684   assert(type2aelembytes(bt) >= 4, "");
5685   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5686   address compress_perm_table = nullptr;
5687   address expand_perm_table = nullptr;
5688   if (type2aelembytes(bt) == 8) {
5689     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5690     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5691     vmovmskpd(rtmp, mask, vec_enc);
5692   } else {
5693     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5694     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5695     vmovmskps(rtmp, mask, vec_enc);
5696   }
5697   shlq(rtmp, 5); // for 32 byte permute row.
5698   if (opcode == Op_CompressV) {
5699     lea(rscratch, ExternalAddress(compress_perm_table));
5700   } else {
5701     lea(rscratch, ExternalAddress(expand_perm_table));
5702   }
5703   addptr(rtmp, rscratch);
5704   vmovdqu(permv, Address(rtmp));
5705   vpermps(dst, permv, src, Assembler::AVX_256bit);
5706   vpxor(xtmp, xtmp, xtmp, vec_enc);
5707   // Blend the result with zero vector using permute mask, each column entry
5708   // in a permute table row contains either a valid permute index or a -1 (default)
5709   // value, this can potentially be used as a blending mask after
5710   // compressing/expanding the source vector lanes.
5711   vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
5712 }
5713 
5714 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5715                                                bool merge, BasicType bt, int vec_enc) {
5716   if (opcode == Op_CompressV) {
5717     switch(bt) {
5718     case T_BYTE:
5719       evpcompressb(dst, mask, src, merge, vec_enc);
5720       break;
5721     case T_CHAR:
5722     case T_SHORT:
5723       evpcompressw(dst, mask, src, merge, vec_enc);
5724       break;
5725     case T_INT:
5726       evpcompressd(dst, mask, src, merge, vec_enc);
5727       break;
5728     case T_FLOAT:
5729       evcompressps(dst, mask, src, merge, vec_enc);
5730       break;
5731     case T_LONG:
5732       evpcompressq(dst, mask, src, merge, vec_enc);
5733       break;
5734     case T_DOUBLE:
5735       evcompresspd(dst, mask, src, merge, vec_enc);
5736       break;
5737     default:
5738       fatal("Unsupported type %s", type2name(bt));
5739       break;
5740     }
5741   } else {
5742     assert(opcode == Op_ExpandV, "");
5743     switch(bt) {
5744     case T_BYTE:
5745       evpexpandb(dst, mask, src, merge, vec_enc);
5746       break;
5747     case T_CHAR:
5748     case T_SHORT:
5749       evpexpandw(dst, mask, src, merge, vec_enc);
5750       break;
5751     case T_INT:
5752       evpexpandd(dst, mask, src, merge, vec_enc);
5753       break;
5754     case T_FLOAT:
5755       evexpandps(dst, mask, src, merge, vec_enc);
5756       break;
5757     case T_LONG:
5758       evpexpandq(dst, mask, src, merge, vec_enc);
5759       break;
5760     case T_DOUBLE:
5761       evexpandpd(dst, mask, src, merge, vec_enc);
5762       break;
5763     default:
5764       fatal("Unsupported type %s", type2name(bt));
5765       break;
5766     }
5767   }
5768 }
5769 
5770 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5771                                            KRegister ktmp1, int vec_enc) {
5772   if (opcode == Op_SignumVD) {
5773     vsubpd(dst, zero, one, vec_enc);
5774     // if src < 0 ? -1 : 1
5775     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5776     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5777     // if src == NaN, -0.0 or 0.0 return src.
5778     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5779     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5780   } else {
5781     assert(opcode == Op_SignumVF, "");
5782     vsubps(dst, zero, one, vec_enc);
5783     // if src < 0 ? -1 : 1
5784     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5785     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5786     // if src == NaN, -0.0 or 0.0 return src.
5787     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5788     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5789   }
5790 }
5791 
5792 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5793                                           XMMRegister xtmp1, int vec_enc) {
5794   if (opcode == Op_SignumVD) {
5795     vsubpd(dst, zero, one, vec_enc);
5796     // if src < 0 ? -1 : 1
5797     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
5798     // if src == NaN, -0.0 or 0.0 return src.
5799     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5800     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5801   } else {
5802     assert(opcode == Op_SignumVF, "");
5803     vsubps(dst, zero, one, vec_enc);
5804     // if src < 0 ? -1 : 1
5805     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
5806     // if src == NaN, -0.0 or 0.0 return src.
5807     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5808     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5809   }
5810 }
5811 
5812 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5813   if (VM_Version::supports_avx512bw()) {
5814     if (mask_len > 32) {
5815       kmovql(dst, src);
5816     } else {
5817       kmovdl(dst, src);
5818       if (mask_len != 32) {
5819         kshiftrdl(dst, dst, 32 - mask_len);
5820       }
5821     }
5822   } else {
5823     assert(mask_len <= 16, "");
5824     kmovwl(dst, src);
5825     if (mask_len != 16) {
5826       kshiftrwl(dst, dst, 16 - mask_len);
5827     }
5828   }
5829 }
5830 
5831 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5832   int lane_size = type2aelembytes(bt);
5833   if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5834       (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5835     movptr(rtmp, imm32);
5836     switch(lane_size) {
5837       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5838       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5839       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5840       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5841       fatal("Unsupported lane size %d", lane_size);
5842       break;
5843     }
5844   } else {
5845     movptr(rtmp, imm32);
5846     movq(dst, rtmp);
5847     switch(lane_size) {
5848       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5849       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5850       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5851       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5852       fatal("Unsupported lane size %d", lane_size);
5853       break;
5854     }
5855   }
5856 }
5857 
5858 //
5859 // Following is lookup table based popcount computation algorithm:-
//       Index   Bit set count
//     [ 0000 ->   0,
//       0001 ->   1,
//       0010 ->   1,
//       0011 ->   2,
//       0100 ->   1,
//       0101 ->   2,
//       0110 ->   2,
//       0111 ->   3,
//       1000 ->   1,
//       1001 ->   2,
//       1010 ->   2,
//       1011 ->   3,
//       1100 ->   2,
//       1101 ->   3,
//       1110 ->   3,
//       1111 ->   4 ]
5876 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5877 //     shuffle indices for lookup table access.
5878 //  b. Right shift each byte of vector lane by 4 positions.
5879 //  c. Count the number of 1s in 4 MSB bits each byte. These bits are used as
5880 //     shuffle indices for lookup table access.
5881 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5882 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5883 //     count of all the bytes of a quadword.
5884 //  f. Perform step e. for upper 128bit vector lane.
5885 //  g. Pack the bitset count of quadwords back to double word.
5886 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
5887 
// Per-byte population count of src into dst using a nibble lookup table.
//   xtmp1 - scratch; nibble mask, then the counts of the low nibbles.
//   xtmp2 - scratch; the popcount lookup table.
//   rtmp  - scratch GPR used to materialize the nibble mask.
void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
  // xtmp1 = 0x0F in every byte.
  vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
  // dst = high nibble of every byte (shifted down), xtmp1 = low nibble of every byte.
  vpsrlw(dst, src, 4, vec_enc);
  vpand(dst, dst, xtmp1, vec_enc);
  vpand(xtmp1, src, xtmp1, vec_enc);
  // Look up the bit count of both nibbles and add them.
  vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
  vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
  vpshufb(dst, xtmp2, dst, vec_enc);
  vpaddb(dst, dst, xtmp1, vec_enc);
}
5900 
// Per-doubleword population count of src into dst, built on the per-byte count.
//   xtmp1 - scratch; per-byte counts, then the sums for the low doublewords.
//   xtmp2 - scratch; used by the byte count and then as zero vector.
//   rtmp  - scratch GPR forwarded to the byte count.
void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // xtmp1 = per-byte popcount of src (dst serves as a temporary here).
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Following code is as per steps e,f,g and h of above algorithm.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  // Spread the high doublewords to quadwords (zero-filled) and sum their bytes.
  vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
  vpsadbw(dst, dst, xtmp2, vec_enc);
  // Same for the low doublewords.
  vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
  vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
  // Pack the quadword sums back into doubleword lanes.
  vpackuswb(dst, xtmp1, dst, vec_enc);
}
5912 
// Per-word population count of src into dst, built on the per-byte count.
//   xtmp1 - scratch; per-byte counts, then the counts of the low bytes.
//   xtmp2 - scratch; used by the byte count and then as low-byte mask.
//   rtmp  - scratch GPR used for the byte count and for the mask constant.
void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // xtmp1 = per-byte popcount of src (dst serves as a temporary here).
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Add the popcount of upper and lower bytes of word.
  vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
  // dst = count of the upper byte, xtmp1 = count of the lower byte.
  vpsrlw(dst, xtmp1, 8, vec_enc);
  vpand(xtmp1, xtmp1, xtmp2, vec_enc);
  vpaddw(dst, dst, xtmp1, vec_enc);
}
5922 
// Per-quadword population count of src into dst, built on the per-byte count.
//   xtmp1 - scratch; per-byte counts.
//   xtmp2 - scratch; used by the byte count and then as zero vector.
//   rtmp  - scratch GPR forwarded to the byte count.
void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // xtmp1 = per-byte popcount of src (dst serves as a temporary here).
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Sum of absolute differences against zero adds up the 8 byte counts of each quadword.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpsadbw(dst, xtmp1, xtmp2, vec_enc);
}
5929 
5930 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5931                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5932   switch(bt) {
5933     case T_LONG:
5934       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5935       break;
5936     case T_INT:
5937       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5938       break;
5939     case T_CHAR:
5940     case T_SHORT:
5941       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5942       break;
5943     case T_BYTE:
5944     case T_BOOLEAN:
5945       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5946       break;
5947     default:
5948       fatal("Unsupported type %s", type2name(bt));
5949       break;
5950   }
5951 }
5952 
5953 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5954                                                       KRegister mask, bool merge, int vec_enc) {
5955   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5956   switch(bt) {
5957     case T_LONG:
5958       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5959       evpopcntq(dst, mask, src, merge, vec_enc);
5960       break;
5961     case T_INT:
5962       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5963       evpopcntd(dst, mask, src, merge, vec_enc);
5964       break;
5965     case T_CHAR:
5966     case T_SHORT:
5967       assert(VM_Version::supports_avx512_bitalg(), "");
5968       evpopcntw(dst, mask, src, merge, vec_enc);
5969       break;
5970     case T_BYTE:
5971     case T_BOOLEAN:
5972       assert(VM_Version::supports_avx512_bitalg(), "");
5973       evpopcntb(dst, mask, src, merge, vec_enc);
5974       break;
5975     default:
5976       fatal("Unsupported type %s", type2name(bt));
5977       break;
5978   }
5979 }
5980 
5981 // Bit reversal algorithm first reverses the bits of each byte followed by
5982 // a byte level reversal for multi-byte primitive types (short/int/long).
5983 // Algorithm performs a lookup table access to get reverse bit sequence
5984 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte
5985 // is obtained by swapping the reverse bit sequences of upper and lower
5986 // nibble of a byte.
// Reverses the bit order within every lane (of type bt) of src and writes the result
// to dst. Three code shapes are emitted depending on the CPU features and vec_enc:
//   - AVX512VL+BW: nibble lookup table with EVEX logical instructions,
//   - 512-bit vectors without VL+BW: shift/mask based swaps (int and long lanes only),
//   - otherwise: nibble lookup table with AVX instructions.
//   xtmp1, xtmp2 - vector scratch registers; rtmp - scratch GPR for constants/addresses.
void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, Register rtmp, int vec_enc) {
  if (VM_Version::supports_avx512vlbw()) {

    // Get the reverse bit sequence of lower nibble of each byte.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
    evpandq(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    evporq(xtmp2, dst, xtmp2, vec_enc);
    // Bits are now reversed within each byte; reverse the byte order within each lane.
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);

  } else if(vec_enc == Assembler::AVX_512bit) {
    // Shift based bit reversal.
    assert(bt == T_LONG || bt == T_INT, "");

    // Swap lower and upper nibble of each byte.
    vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);

    // Swap two least and most significant bits of each nibble.
    vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);

    // Swap adjacent pair of bits.
    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);

    // Bits are now reversed within each byte; reverse the byte order within each lane.
    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
  } else {
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);

    // Get the reverse bit sequence of lower nibble of each byte.
    vpand(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    vpor(xtmp2, dst, xtmp2, vec_enc);
    // Bits are now reversed within each byte; reverse the byte order within each lane.
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);
  }
}
6044 
// Bit reversal of every lane (of type bt) of src into dst using GFNI.
//   mask     - address of the 64-bit affine matrix constant that is broadcast to xtmp.
//   xtmp     - scratch; holds the matrix and then the per-byte bit-reversed data.
//   rscratch - scratch GPR for addressing mask; may be noreg only if mask is always reachable.
void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
                                                XMMRegister xtmp, Register rscratch) {
  assert(VM_Version::supports_gfni(), "");
  assert(rscratch != noreg || always_reachable(mask), "missing");

  // Galois field instruction based bit reversal based on following algorithm.
  // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
  vpbroadcastq(xtmp, mask, vec_enc, rscratch);
  vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
  // Bits are reversed within each byte; now reverse the byte order within each lane.
  vector_reverse_byte(bt, dst, xtmp, vec_enc);
}
6056 
// Swaps adjacent groups of nbits bits in src and writes the result to dst:
//   dst = ((src & bitmask) << nbits) | ((src & ~bitmask) >> nbits)
// bitmask is a 32-bit pattern replicated across the vector that selects the lower
// group of each pair (e.g. 0x0F0F0F0F together with nbits == 4 swaps nibbles).
//   xtmp1 - scratch; holds the mask and then the shifted upper groups.
//   rtmp  - scratch GPR used to materialize bitmask.
void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
                                          XMMRegister xtmp1, Register rtmp, int vec_enc) {
  vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
  // Lower groups move up ...
  evpandq(dst, xtmp1, src, vec_enc);
  vpsllq(dst, dst, nbits, vec_enc);
  // ... upper groups move down ...
  vpandn(xtmp1, xtmp1, src, vec_enc);
  vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
  // ... and both halves are merged.
  evporq(dst, dst, xtmp1, vec_enc);
}
6066 
6067 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6068                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
6069   // Shift based bit reversal.
6070   assert(VM_Version::supports_evex(), "");
6071   switch(bt) {
6072     case T_LONG:
6073       // Swap upper and lower double word of each quad word.
6074       evprorq(xtmp1, k0, src, 32, true, vec_enc);
6075       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
6076       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6077       break;
6078     case T_INT:
6079       // Swap upper and lower word of each double word.
6080       evprord(xtmp1, k0, src, 16, true, vec_enc);
6081       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6082       break;
6083     case T_CHAR:
6084     case T_SHORT:
6085       // Swap upper and lower byte of each word.
6086       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
6087       break;
6088     case T_BYTE:
6089       evmovdquq(dst, k0, src, true, vec_enc);
6090       break;
6091     default:
6092       fatal("Unsupported type %s", type2name(bt));
6093       break;
6094   }
6095 }
6096 
6097 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
6098   if (bt == T_BYTE) {
6099     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
6100       evmovdquq(dst, k0, src, true, vec_enc);
6101     } else {
6102       vmovdqu(dst, src);
6103     }
6104     return;
6105   }
6106   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
6107   // pre-computed shuffle indices.
6108   switch(bt) {
6109     case T_LONG:
6110       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
6111       break;
6112     case T_INT:
6113       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
6114       break;
6115     case T_CHAR:
6116     case T_SHORT:
6117       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6118       break;
6119     default:
6120       fatal("Unsupported type %s", type2name(bt));
6121       break;
6122   }
6123   vpshufb(dst, src, dst, vec_enc);
6124 }
6125 
6126 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6127                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6128                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6129   assert(is_integral_type(bt), "");
6130   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6131   assert(VM_Version::supports_avx512cd(), "");
6132   switch(bt) {
6133     case T_LONG:
6134       evplzcntq(dst, ktmp, src, merge, vec_enc);
6135       break;
6136     case T_INT:
6137       evplzcntd(dst, ktmp, src, merge, vec_enc);
6138       break;
6139     case T_SHORT:
6140       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6141       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6142       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6143       vpunpckhwd(dst, xtmp1, src, vec_enc);
6144       evplzcntd(dst, ktmp, dst, merge, vec_enc);
6145       vpackusdw(dst, xtmp2, dst, vec_enc);
6146       break;
6147     case T_BYTE:
6148       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6149       // accessing the lookup table.
6150       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6151       // accessing the lookup table.
6152       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6153       assert(VM_Version::supports_avx512bw(), "");
6154       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6155       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6156       vpand(xtmp2, dst, src, vec_enc);
6157       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6158       vpsrlw(xtmp3, src, 4, vec_enc);
6159       vpand(xtmp3, dst, xtmp3, vec_enc);
6160       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6161       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6162       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6163       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6164       break;
6165     default:
6166       fatal("Unsupported type %s", type2name(bt));
6167       break;
6168   }
6169 }
6170 
6171 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6172                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6173   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6174   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6175   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6176   // accessing the lookup table.
6177   vpand(dst, xtmp2, src, vec_enc);
6178   vpshufb(dst, xtmp1, dst, vec_enc);
6179   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6180   // accessing the lookup table.
6181   vpsrlw(xtmp3, src, 4, vec_enc);
6182   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6183   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6184   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6185   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6186   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6187   vpaddb(dst, dst, xtmp2, vec_enc);
6188   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6189 }
6190 
6191 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6192                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6193   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6194   // Add zero counts of lower byte and upper byte of a word if
6195   // upper byte holds a zero value.
6196   vpsrlw(xtmp3, src, 8, vec_enc);
6197   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6198   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6199   vpsllw(xtmp2, dst, 8, vec_enc);
6200   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6201   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6202   vpsrlw(dst, dst, 8, vec_enc);
6203 }
6204 
6205 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6206                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
6207   // By converting the integer to a float, we can obtain the number of leading zeros based on the exponent of the float.
6208   // As the float exponent contains a bias of 127 for nonzero values, the bias must be removed before interpreting the
6209   // exponent as the leading zero count.
6210 
6211   // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
6212   // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
6213   // contributes to the leading number of zeros.
6214   vpsrld(dst, src, 1, vec_enc);
6215   vpandn(dst, dst, src, vec_enc);
6216 
6217   vcvtdq2ps(dst, dst, vec_enc);
6218 
6219   // By comparing the register to itself, all the bits in the destination are set.
6220   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6221 
6222   // Move the biased exponent to the low end of the lane and mask with 0xFF to discard the sign bit.
6223   vpsrld(xtmp2, xtmp1, 24, vec_enc);
6224   vpsrld(dst, dst, 23, vec_enc);
6225   vpand(dst, xtmp2, dst, vec_enc);
6226 
6227   // Subtract 127 from the exponent, which removes the bias from the exponent.
6228   vpsrld(xtmp2, xtmp1, 25, vec_enc);
6229   vpsubd(dst, dst, xtmp2, vec_enc);
6230 
6231   vpsrld(xtmp2, xtmp1, 27, vec_enc);
6232 
6233   // If the original value is 0 the exponent would not have bias, so the subtraction creates a negative number. If this
6234   // is found in any of the lanes, replace the lane with -1 from xtmp1.
6235   vblendvps(dst, dst, xtmp1, dst, vec_enc, true, xtmp3);
6236 
6237   // If the original value is negative, replace the lane with 31.
6238   vblendvps(dst, dst, xtmp2, src, vec_enc, true, xtmp3);
6239 
6240   // Subtract the exponent from 31, giving the final result. For 0, the result is 32 as the exponent was replaced with -1,
6241   // and for negative numbers the result is 0 as the exponent was replaced with 31.
6242   vpsubd(dst, xtmp2, dst, vec_enc);
6243 }
6244 
6245 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6246                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6247   // Find the leading zeros of the top and bottom halves of the long individually.
6248   vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6249 
6250   // Move the top half result to the bottom half of xtmp1, setting the top half to 0.
6251   vpsrlq(xtmp1, dst, 32, vec_enc);
6252   // By moving the top half result to the right by 6 bits, if the top half was empty (i.e. 32 is returned) the result bit will
6253   // be in the most significant position of the bottom half.
6254   vpsrlq(xtmp2, dst, 6, vec_enc);
6255 
6256   // In the bottom half, add the top half and bottom half results.
6257   vpaddq(dst, xtmp1, dst, vec_enc);
6258 
6259   // For the bottom half, choose between the values using the most significant bit of xtmp2.
6260   // If the MSB is set, then bottom+top in dst is the resulting value. If the top half is less than 32 xtmp1 is chosen,
6261   // which contains only the top half result.
6262   // In the top half the MSB is always zero, so the value in xtmp1 is always chosen. This value is always 0, which clears
6263   // the lane as required.
6264   vblendvps(dst, xtmp1, dst, xtmp2, vec_enc, true, xtmp3);
6265 }
6266 
6267 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6268                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6269                                                        Register rtmp, int vec_enc) {
6270   assert(is_integral_type(bt), "unexpected type");
6271   assert(vec_enc < Assembler::AVX_512bit, "");
6272   switch(bt) {
6273     case T_LONG:
6274       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6275       break;
6276     case T_INT:
6277       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6278       break;
6279     case T_SHORT:
6280       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6281       break;
6282     case T_BYTE:
6283       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6284       break;
6285     default:
6286       fatal("Unsupported type %s", type2name(bt));
6287       break;
6288   }
6289 }
6290 
6291 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6292   switch(bt) {
6293     case T_BYTE:
6294       vpsubb(dst, src1, src2, vec_enc);
6295       break;
6296     case T_SHORT:
6297       vpsubw(dst, src1, src2, vec_enc);
6298       break;
6299     case T_INT:
6300       vpsubd(dst, src1, src2, vec_enc);
6301       break;
6302     case T_LONG:
6303       vpsubq(dst, src1, src2, vec_enc);
6304       break;
6305     default:
6306       fatal("Unsupported type %s", type2name(bt));
6307       break;
6308   }
6309 }
6310 
6311 // Trailing zero count computation is based on leading zero count operation as per
6312 // following equation. All AVX3 targets support AVX512CD feature which offers
6313 // direct vector instruction to compute leading zero count.
6314 //      CTZ = PRIM_TYPE_WIDHT - CLZ((x - 1) & ~x)
6315 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6316                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6317                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6318   assert(is_integral_type(bt), "");
6319   // xtmp = -1
6320   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6321   // xtmp = xtmp + src
6322   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6323   // xtmp = xtmp & ~src
6324   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6325   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6326   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6327   vpsub(bt, dst, xtmp4, dst, vec_enc);
6328 }
6329 
6330 // Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation
6331 //      CTZ = PRIM_TYPE_WIDHT - POPC(x | -x)
6332 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6333                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6334   assert(is_integral_type(bt), "");
6335   // xtmp = 0
6336   vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
6337   // xtmp = 0 - src
6338   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6339   // xtmp = xtmp | src
6340   vpor(xtmp3, xtmp3, src, vec_enc);
6341   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6342   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6343   vpsub(bt, dst, xtmp1, dst, vec_enc);
6344 }
6345 
6346 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6347   Label done;
6348   Label neg_divisor_fastpath;
6349   cmpl(divisor, 0);
6350   jccb(Assembler::less, neg_divisor_fastpath);
6351   xorl(rdx, rdx);
6352   divl(divisor);
6353   jmpb(done);
6354   bind(neg_divisor_fastpath);
6355   // Fastpath for divisor < 0:
6356   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6357   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6358   movl(rdx, rax);
6359   subl(rdx, divisor);
6360   if (VM_Version::supports_bmi1() && VM_Version::supports_avx()) {
6361     andnl(rax, rdx, rax);
6362   } else {
6363     notl(rdx);
6364     andl(rax, rdx);
6365   }
6366   shrl(rax, 31);
6367   bind(done);
6368 }
6369 
6370 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6371   Label done;
6372   Label neg_divisor_fastpath;
6373   cmpl(divisor, 0);
6374   jccb(Assembler::less, neg_divisor_fastpath);
6375   xorl(rdx, rdx);
6376   divl(divisor);
6377   jmpb(done);
6378   bind(neg_divisor_fastpath);
6379   // Fastpath when divisor < 0:
6380   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6381   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6382   movl(rdx, rax);
6383   subl(rax, divisor);
6384   if (VM_Version::supports_bmi1() && VM_Version::supports_avx()) {
6385     andnl(rax, rax, rdx);
6386   } else {
6387     notl(rax);
6388     andl(rax, rdx);
6389   }
6390   sarl(rax, 31);
6391   andl(rax, divisor);
6392   subl(rdx, rax);
6393   bind(done);
6394 }
6395 
6396 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6397   Label done;
6398   Label neg_divisor_fastpath;
6399 
6400   cmpl(divisor, 0);
6401   jccb(Assembler::less, neg_divisor_fastpath);
6402   xorl(rdx, rdx);
6403   divl(divisor);
6404   jmpb(done);
6405   bind(neg_divisor_fastpath);
6406   // Fastpath for divisor < 0:
6407   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6408   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6409   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6410   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6411   movl(rdx, rax);
6412   subl(rax, divisor);
6413   if (VM_Version::supports_bmi1() && VM_Version::supports_avx()) {
6414     andnl(rax, rax, rdx);
6415   } else {
6416     notl(rax);
6417     andl(rax, rdx);
6418   }
6419   movl(tmp, rax);
6420   shrl(rax, 31); // quotient
6421   sarl(tmp, 31);
6422   andl(tmp, divisor);
6423   subl(rdx, tmp); // remainder
6424   bind(done);
6425 }
6426 
6427 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6428                                  XMMRegister xtmp2, Register rtmp) {
6429   if(VM_Version::supports_gfni()) {
6430     // Galois field instruction based bit reversal based on following algorithm.
6431     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6432     mov64(rtmp, 0x8040201008040201L);
6433     movq(xtmp1, src);
6434     movq(xtmp2, rtmp);
6435     gf2p8affineqb(xtmp1, xtmp2, 0);
6436     movq(dst, xtmp1);
6437   } else {
6438     // Swap even and odd numbered bits.
6439     movl(rtmp, src);
6440     andl(rtmp, 0x55555555);
6441     shll(rtmp, 1);
6442     movl(dst, src);
6443     andl(dst, 0xAAAAAAAA);
6444     shrl(dst, 1);
6445     orl(dst, rtmp);
6446 
6447     // Swap LSB and MSB 2 bits of each nibble.
6448     movl(rtmp, dst);
6449     andl(rtmp, 0x33333333);
6450     shll(rtmp, 2);
6451     andl(dst, 0xCCCCCCCC);
6452     shrl(dst, 2);
6453     orl(dst, rtmp);
6454 
6455     // Swap LSB and MSB 4 bits of each byte.
6456     movl(rtmp, dst);
6457     andl(rtmp, 0x0F0F0F0F);
6458     shll(rtmp, 4);
6459     andl(dst, 0xF0F0F0F0);
6460     shrl(dst, 4);
6461     orl(dst, rtmp);
6462   }
6463   bswapl(dst);
6464 }
6465 
6466 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6467                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
6468   if(VM_Version::supports_gfni()) {
6469     // Galois field instruction based bit reversal based on following algorithm.
6470     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6471     mov64(rtmp1, 0x8040201008040201L);
6472     movq(xtmp1, src);
6473     movq(xtmp2, rtmp1);
6474     gf2p8affineqb(xtmp1, xtmp2, 0);
6475     movq(dst, xtmp1);
6476   } else {
6477     // Swap even and odd numbered bits.
6478     movq(rtmp1, src);
6479     mov64(rtmp2, 0x5555555555555555L);
6480     andq(rtmp1, rtmp2);
6481     shlq(rtmp1, 1);
6482     movq(dst, src);
6483     notq(rtmp2);
6484     andq(dst, rtmp2);
6485     shrq(dst, 1);
6486     orq(dst, rtmp1);
6487 
6488     // Swap LSB and MSB 2 bits of each nibble.
6489     movq(rtmp1, dst);
6490     mov64(rtmp2, 0x3333333333333333L);
6491     andq(rtmp1, rtmp2);
6492     shlq(rtmp1, 2);
6493     notq(rtmp2);
6494     andq(dst, rtmp2);
6495     shrq(dst, 2);
6496     orq(dst, rtmp1);
6497 
6498     // Swap LSB and MSB 4 bits of each byte.
6499     movq(rtmp1, dst);
6500     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6501     andq(rtmp1, rtmp2);
6502     shlq(rtmp1, 4);
6503     notq(rtmp2);
6504     andq(dst, rtmp2);
6505     shrq(dst, 4);
6506     orq(dst, rtmp1);
6507   }
6508   bswapq(dst);
6509 }
6510 
6511 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6512   Label done;
6513   Label neg_divisor_fastpath;
6514   cmpq(divisor, 0);
6515   jccb(Assembler::less, neg_divisor_fastpath);
6516   xorl(rdx, rdx);
6517   divq(divisor);
6518   jmpb(done);
6519   bind(neg_divisor_fastpath);
6520   // Fastpath for divisor < 0:
6521   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6522   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6523   movq(rdx, rax);
6524   subq(rdx, divisor);
6525   if (VM_Version::supports_bmi1() && VM_Version::supports_avx()) {
6526     andnq(rax, rdx, rax);
6527   } else {
6528     notq(rdx);
6529     andq(rax, rdx);
6530   }
6531   shrq(rax, 63);
6532   bind(done);
6533 }
6534 
6535 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6536   Label done;
6537   Label neg_divisor_fastpath;
6538   cmpq(divisor, 0);
6539   jccb(Assembler::less, neg_divisor_fastpath);
6540   xorq(rdx, rdx);
6541   divq(divisor);
6542   jmp(done);
6543   bind(neg_divisor_fastpath);
6544   // Fastpath when divisor < 0:
6545   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6546   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6547   movq(rdx, rax);
6548   subq(rax, divisor);
6549   if (VM_Version::supports_bmi1() && VM_Version::supports_avx()) {
6550     andnq(rax, rax, rdx);
6551   } else {
6552     notq(rax);
6553     andq(rax, rdx);
6554   }
6555   sarq(rax, 63);
6556   andq(rax, divisor);
6557   subq(rdx, rax);
6558   bind(done);
6559 }
6560 
6561 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6562   Label done;
6563   Label neg_divisor_fastpath;
6564   cmpq(divisor, 0);
6565   jccb(Assembler::less, neg_divisor_fastpath);
6566   xorq(rdx, rdx);
6567   divq(divisor);
6568   jmp(done);
6569   bind(neg_divisor_fastpath);
6570   // Fastpath for divisor < 0:
6571   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6572   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6573   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6574   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6575   movq(rdx, rax);
6576   subq(rax, divisor);
6577   if (VM_Version::supports_bmi1() && VM_Version::supports_avx()) {
6578     andnq(rax, rax, rdx);
6579   } else {
6580     notq(rax);
6581     andq(rax, rdx);
6582   }
6583   movq(tmp, rax);
6584   shrq(rax, 63); // quotient
6585   sarq(tmp, 63);
6586   andq(tmp, divisor);
6587   subq(rdx, tmp); // remainder
6588   bind(done);
6589 }
6590 
6591 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6592                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6593                                         int vlen_enc) {
6594   assert(VM_Version::supports_avx512bw(), "");
6595   // Byte shuffles are inlane operations and indices are determined using
6596   // lower 4 bit of each shuffle lane, thus all shuffle indices are
6597   // normalized to index range 0-15. This makes sure that all the multiples
6598   // of an index value are placed at same relative position in 128 bit
6599   // lane i.e. elements corresponding to shuffle indices 16, 32 and 64
6600   // will be 16th element in their respective 128 bit lanes.
6601   movl(rtmp, 16);
6602   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6603 
6604   // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
6605   // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
6606   // original shuffle indices and move the shuffled lanes corresponding to true
6607   // mask to destination vector.
6608   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6609   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6610   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6611 
6612   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6613   // and broadcasting second 128 bit lane.
6614   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6615   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6616   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6617   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6618   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6619 
6620   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6621   // and broadcasting third 128 bit lane.
6622   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6623   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6624   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6625   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6626   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6627 
6628   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
6629   // and broadcasting third 128 bit lane.
6630   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6631   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6632   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6633   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6634   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6635 }
6636 
6637 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6638                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6639   if (vlen_enc == AVX_128bit) {
6640     vpermilps(dst, src, shuffle, vlen_enc);
6641   } else if (bt == T_INT) {
6642     vpermd(dst, shuffle, src, vlen_enc);
6643   } else {
6644     assert(bt == T_FLOAT, "");
6645     vpermps(dst, shuffle, src, vlen_enc);
6646   }
6647 }
6648 
6649 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6650   switch(opcode) {
6651     case Op_AddHF: vaddsh(dst, src1, src2); break;
6652     case Op_SubHF: vsubsh(dst, src1, src2); break;
6653     case Op_MulHF: vmulsh(dst, src1, src2); break;
6654     case Op_DivHF: vdivsh(dst, src1, src2); break;
6655     default: assert(false, "%s", NodeClassNames[opcode]); break;
6656   }
6657 }
6658 
6659 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6660   switch(elem_bt) {
6661     case T_BYTE:
6662       if (ideal_opc == Op_SaturatingAddV) {
6663         vpaddsb(dst, src1, src2, vlen_enc);
6664       } else {
6665         assert(ideal_opc == Op_SaturatingSubV, "");
6666         vpsubsb(dst, src1, src2, vlen_enc);
6667       }
6668       break;
6669     case T_SHORT:
6670       if (ideal_opc == Op_SaturatingAddV) {
6671         vpaddsw(dst, src1, src2, vlen_enc);
6672       } else {
6673         assert(ideal_opc == Op_SaturatingSubV, "");
6674         vpsubsw(dst, src1, src2, vlen_enc);
6675       }
6676       break;
6677     default:
6678       fatal("Unsupported type %s", type2name(elem_bt));
6679       break;
6680   }
6681 }
6682 
6683 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6684   switch(elem_bt) {
6685     case T_BYTE:
6686       if (ideal_opc == Op_SaturatingAddV) {
6687         vpaddusb(dst, src1, src2, vlen_enc);
6688       } else {
6689         assert(ideal_opc == Op_SaturatingSubV, "");
6690         vpsubusb(dst, src1, src2, vlen_enc);
6691       }
6692       break;
6693     case T_SHORT:
6694       if (ideal_opc == Op_SaturatingAddV) {
6695         vpaddusw(dst, src1, src2, vlen_enc);
6696       } else {
6697         assert(ideal_opc == Op_SaturatingSubV, "");
6698         vpsubusw(dst, src1, src2, vlen_enc);
6699       }
6700       break;
6701     default:
6702       fatal("Unsupported type %s", type2name(elem_bt));
6703       break;
6704   }
6705 }
6706 
6707 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6708                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
6709   // For unsigned subtraction, overflow happens when magnitude of second input is greater than first input.
6710   // overflow_mask = Inp1 <u Inp2
6711   evpcmpu(elem_bt, ktmp,  src2, src1, Assembler::lt, vlen_enc);
6712   // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6713   evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6714 }
6715 
6716 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6717                                                               XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6718   // Emulate unsigned comparison using signed comparison
6719   // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
6720   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6721   vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6722   vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6723 
6724   vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6725 
6726   // Res = INP1 - INP2 (non-commutative and non-associative)
6727   vpsub(elem_bt, dst, src1, src2, vlen_enc);
6728   // Res = Mask ? Zero : Res
6729   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6730   vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6731 }
6732 
6733 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6734                                                                XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
6735   // Unsigned values ranges comprise of only +ve numbers, thus there exist only an upper bound saturation.
6736   // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6737   // Res = Signed Add INP1, INP2
6738   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6739   // T1 = SRC1 | SRC2
6740   vpor(xtmp1, src1, src2, vlen_enc);
6741   // Max_Unsigned = -1
6742   vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6743   // Unsigned compare:  Mask = Res <u T1
6744   evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6745   // res  = Mask ? Max_Unsigned : Res
6746   evpblend(elem_bt, dst, ktmp,  dst, xtmp2, true, vlen_enc);
6747 }
6748 
6749 //
// Section 2-13 of Hacker's Delight lists the following overflow detection check for
// the saturating unsigned addition operation.
//    overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
//
// We empirically determined its semantic equivalence to the following reduced expression
//    overflow_mask =  (a + b) <u (a | b)
//
// and also verified it through the Alive2 solver.
// (https://alive2.llvm.org/ce/z/XDQ7dY)
6759 //
6760 
6761 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6762                                                               XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6763   // Res = Signed Add INP1, INP2
6764   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6765   // Compute T1 = INP1 | INP2
6766   vpor(xtmp3, src1, src2, vlen_enc);
6767   // T1 = Minimum signed value.
6768   vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6769   // Convert T1 to signed value, T1 = T1 + MIN_VALUE
6770   vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
6771   // Convert Res to signed value, Res<s> = Res + MIN_VALUE
6772   vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
6773   // Compute overflow detection mask = Res<1> <s T1
6774   if (elem_bt == T_INT) {
6775     vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
6776   } else {
6777     assert(elem_bt == T_LONG, "");
6778     vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
6779   }
6780   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6781 }
6782 
6783 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6784                                       int vlen_enc, bool xtmp2_hold_M1) {
6785   if (VM_Version::supports_avx512dq()) {
6786     evpmovq2m(ktmp, src, vlen_enc);
6787   } else {
6788     assert(VM_Version::supports_evex(), "");
6789     if (!xtmp2_hold_M1) {
6790       vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6791     }
6792     evpsraq(xtmp1, src, 63, vlen_enc);
6793     evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6794   }
6795 }
6796 
6797 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6798                                       int vlen_enc, bool xtmp2_hold_M1) {
6799   if (VM_Version::supports_avx512dq()) {
6800     evpmovd2m(ktmp, src, vlen_enc);
6801   } else {
6802     assert(VM_Version::supports_evex(), "");
6803     if (!xtmp2_hold_M1) {
6804       vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6805     }
6806     vpsrad(xtmp1, src, 31, vlen_enc);
6807     Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6808   }
6809 }
6810 
6811 
6812 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6813   if (elem_bt == T_LONG) {
6814     if (VM_Version::supports_evex()) {
6815       evpsraq(dst, src, 63, vlen_enc);
6816     } else {
6817       vpsrad(dst, src, 31, vlen_enc);
6818       vpshufd(dst, dst, 0xF5, vlen_enc);
6819     }
6820   } else {
6821     assert(elem_bt == T_INT, "");
6822     vpsrad(dst, src, 31, vlen_enc);
6823   }
6824 }
6825 
6826 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6827   if (compute_allones) {
6828     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6829       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6830     } else {
6831       vpcmpeqq(allones, allones, allones, vlen_enc);
6832     }
6833   }
6834   if (elem_bt == T_LONG) {
6835     vpsrlq(dst, allones, 1, vlen_enc);
6836   } else {
6837     assert(elem_bt == T_INT, "");
6838     vpsrld(dst, allones, 1, vlen_enc);
6839   }
6840 }
6841 
6842 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6843   if (compute_allones) {
6844     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6845       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6846     } else {
6847       vpcmpeqq(allones, allones, allones, vlen_enc);
6848     }
6849   }
6850   if (elem_bt == T_LONG) {
6851     vpsllq(dst, allones, 63, vlen_enc);
6852   } else {
6853     assert(elem_bt == T_INT, "");
6854     vpslld(dst, allones, 31, vlen_enc);
6855   }
6856 }
6857 
6858 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
6859                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6860   switch(elem_bt) {
6861     case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6862     case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6863     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6864     case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6865     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6866   }
6867 }
6868 
6869 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6870   switch(elem_bt) {
6871     case  T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
6872     case  T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
6873     case  T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6874     case  T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
6875     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6876   }
6877 }
6878 
6879 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6880                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6881   if (elem_bt == T_LONG) {
6882     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6883   } else {
6884     assert(elem_bt == T_INT, "");
6885     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6886   }
6887 }
6888 
// Emits a signed saturating add/subtract over T_INT or T_LONG lanes, using
// opmask registers for overflow detection and blending.
//   ideal_opc    - Op_SaturatingAddV, otherwise Op_SaturatingSubV (asserted).
//   dst          - receives src1 +/- src2; lanes flagged as overflowing are
//                  replaced by a saturating MAX/MIN value.
//   src1, src2   - input vectors.
//   xtmp1, xtmp2 - vector temporaries, overwritten.
//   ktmp1, ktmp2 - opmask temporaries, overwritten (ktmp1: overflow lanes,
//                  ktmp2: polarity of the first input).
// The statement order matters: xtmp1 is expected to hold all ones between the
// first evpmov_vec_to_mask call and the vpgenmin_value call.
void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                         KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection is based on Hacker's Delight, section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if the result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // the result polarity does not comply with the first input polarity.
    // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Compute the overflow detection mask from xtmp2 (the overflow indicator computed above).
  evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 holds -1 in all its lanes after the above call.

  // Compute the mask based on the first input's polarity; the trailing 'true'
  // tells the helper that xtmp1 already holds -1.
  evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);

  // xtmp2 = all-ones >>> 1 (MAX value); xtmp1 is (re)filled with all ones by this call.
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  // xtmp1 = all-ones << 31/63 (MIN value), computed in place from the all-ones xtmp1.
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in the first input polarity mask hold the MIN value.
  evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
  // Blend destination lanes with saturated values using the overflow detection mask.
  evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
}
6931 
6932 
// Emits a signed saturating add/subtract over T_INT or T_LONG lanes without
// opmask registers: overflow and polarity masks are kept in vector registers
// and applied with vpblendvb.
//   ideal_opc      - Op_SaturatingAddV, otherwise Op_SaturatingSubV (asserted).
//   dst            - receives src1 +/- src2; lanes flagged as overflowing are
//                    replaced by a saturating MAX/MIN value.
//   src1, src2     - input vectors.
//   xtmp1..xtmp4   - vector temporaries, overwritten (xtmp3: overflow mask,
//                    xtmp4: polarity mask of the first input).
void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                        XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                        XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection is based on Hacker's Delight, section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if the result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // the result polarity does not comply with the first input polarity.
    // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Sign-extend to compute the overflow detection mask (xtmp3).
  vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);

  // xtmp1 = all ones; then xtmp2 = all-ones >>> 1 (MAX value) and
  // xtmp1 = all-ones << 31/63 (MIN value), the latter computed in place.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose the saturating min/max vector using the first input polarity mask (xtmp4).
  vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
  vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);

  // Blend the result with the saturating vector using the overflow detection mask.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
6973 
6974 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6975   switch(elem_bt) {
6976     case T_BYTE:
6977       if (ideal_opc == Op_SaturatingAddV) {
6978         vpaddsb(dst, src1, src2, vlen_enc);
6979       } else {
6980         assert(ideal_opc == Op_SaturatingSubV, "");
6981         vpsubsb(dst, src1, src2, vlen_enc);
6982       }
6983       break;
6984     case T_SHORT:
6985       if (ideal_opc == Op_SaturatingAddV) {
6986         vpaddsw(dst, src1, src2, vlen_enc);
6987       } else {
6988         assert(ideal_opc == Op_SaturatingSubV, "");
6989         vpsubsw(dst, src1, src2, vlen_enc);
6990       }
6991       break;
6992     default:
6993       fatal("Unsupported type %s", type2name(elem_bt));
6994       break;
6995   }
6996 }
6997 
6998 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6999   switch(elem_bt) {
7000     case T_BYTE:
7001       if (ideal_opc == Op_SaturatingAddV) {
7002         vpaddusb(dst, src1, src2, vlen_enc);
7003       } else {
7004         assert(ideal_opc == Op_SaturatingSubV, "");
7005         vpsubusb(dst, src1, src2, vlen_enc);
7006       }
7007       break;
7008     case T_SHORT:
7009       if (ideal_opc == Op_SaturatingAddV) {
7010         vpaddusw(dst, src1, src2, vlen_enc);
7011       } else {
7012         assert(ideal_opc == Op_SaturatingSubV, "");
7013         vpsubusw(dst, src1, src2, vlen_enc);
7014       }
7015       break;
7016     default:
7017       fatal("Unsupported type %s", type2name(elem_bt));
7018       break;
7019   }
7020 }
7021 
7022 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
7023                                                      XMMRegister src2, int vlen_enc) {
7024   switch(elem_bt) {
7025     case T_BYTE:
7026       evpermi2b(dst, src1, src2, vlen_enc);
7027       break;
7028     case T_SHORT:
7029       evpermi2w(dst, src1, src2, vlen_enc);
7030       break;
7031     case T_INT:
7032       evpermi2d(dst, src1, src2, vlen_enc);
7033       break;
7034     case T_LONG:
7035       evpermi2q(dst, src1, src2, vlen_enc);
7036       break;
7037     case T_FLOAT:
7038       evpermi2ps(dst, src1, src2, vlen_enc);
7039       break;
7040     case T_DOUBLE:
7041       evpermi2pd(dst, src1, src2, vlen_enc);
7042       break;
7043     default:
7044       fatal("Unsupported type %s", type2name(elem_bt));
7045       break;
7046   }
7047 }
7048 
7049 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
7050   if (is_unsigned) {
7051     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7052   } else {
7053     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7054   }
7055 }
7056 
7057 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
7058   if (is_unsigned) {
7059     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7060   } else {
7061     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7062   }
7063 }
7064 
7065 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
7066   switch(opcode) {
7067     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7068     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7069     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7070     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7071     default: assert(false, "%s", NodeClassNames[opcode]); break;
7072   }
7073 }
7074 
7075 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7076   switch(opcode) {
7077     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7078     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7079     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7080     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7081     default: assert(false, "%s", NodeClassNames[opcode]); break;
7082   }
7083 }
7084 
7085 void C2_MacroAssembler::sminmax_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7086                                      KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
7087   vminmax_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
7088 }
7089 
7090 void C2_MacroAssembler::sminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7091                                              KRegister ktmp) {
7092   if (opcode == Op_MaxHF) {
7093     // dst = max(src1, src2)
7094     evminmaxsh(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN);
7095   } else {
7096     assert(opcode == Op_MinHF, "");
7097     // dst = min(src1, src2)
7098     evminmaxsh(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN);
7099   }
7100 }
7101 
// Emits a half-precision (FP16) min/max sequence built from evmaxph/evminph.
//   opcode       - Op_MaxVHF/Op_MaxHF select max; otherwise Op_MinVHF/Op_MinHF
//                  (asserted) select min.
//   dst          - receives the result.
//   src1, src2   - inputs.
//   ktmp         - opmask temporary, overwritten (first the sign mask, then the NaN mask).
//   xtmp1, xtmp2 - vector temporaries, overwritten with the sign-ordered inputs.
// The inputs are first reordered by sign bit so that the hardware rule
// "return the second source operand" gives the desired result for zeros of
// either sign and for a NaN in the second operand; a NaN in the first
// operand is then merged into dst explicitly.
void C2_MacroAssembler::vminmax_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                     KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
    // Move sign bits of src2 to mask register.
    evpmovw2m(ktmp, src2, vlen_enc);
    // xtmp1 = src2 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // The idea behind the above swapping is to make the second source operand a +ve value.
    // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned.
    // dst = max(xtmp1, xtmp2)
    evmaxph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // The final result is the same as the first source if it is a NaN value;
    // in case the second operand holds a NaN value then, as per the above semantics,
    // the result is the same as the second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  } else {
    assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
    // Move sign bits of src1 to mask register.
    evpmovw2m(ktmp, src1, vlen_enc);
    // xtmp1 = src1 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src1 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // The idea behind the above swapping is to make the second source operand a -ve value.
    // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned.
    // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
    // or a valid floating-point value, is written to the result.
    // dst = min(xtmp1, xtmp2)
    evminph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // The final result is the same as the first source if it is a NaN value;
    // in case the second operand holds a NaN value then, as per the above semantics,
    // the result is the same as the second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  }
}
7146 
7147 void C2_MacroAssembler::vminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7148                                              KRegister ktmp, int vlen_enc) {
7149   if (opcode == Op_MaxVHF) {
7150     // dst = max(src1, src2)
7151     evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vlen_enc);
7152   } else {
7153     assert(opcode == Op_MinVHF, "");
7154     // dst = min(src1, src2)
7155     evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vlen_enc);
7156   }
7157 }
7158 
7159 void C2_MacroAssembler::vminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, Address src2,
7160                                              KRegister ktmp, int vlen_enc) {
7161   if (opcode == Op_MaxVHF) {
7162     // dst = max(src1, src2)
7163     evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vlen_enc);
7164   } else {
7165     assert(opcode == Op_MinVHF, "");
7166     // dst = min(src1, src2)
7167     evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vlen_enc);
7168   }
7169 }
7170 
7171 int C2_MacroAssembler::vector_iota_entry_index(BasicType bt) {
7172   // The vector iota entries array is ordered by type B/S/I/L/F/D, and
7173   // the offset between two types is 16.
7174   switch(bt) {
7175   case T_BYTE:
7176     return 0;
7177   case T_SHORT:
7178     return 1;
7179   case T_INT:
7180     return 2;
7181   case T_LONG:
7182     return 3;
7183   case T_FLOAT:
7184     return 4;
7185   case T_DOUBLE:
7186     return 5;
7187   default:
7188     ShouldNotReachHere();
7189   }
7190 }