1 /*
   2  * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "../../share/runtime/globals.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/globals.hpp"
  37 #include "runtime/objectMonitor.hpp"
  38 #include "runtime/objectMonitorTable.hpp"
  39 #include "runtime/stubRoutines.hpp"
  40 #include "runtime/synchronizer.hpp"
  41 #include "utilities/checkedCast.hpp"
  42 #include "utilities/globalDefinitions.hpp"
  43 #include "utilities/powerOfTwo.hpp"
  44 #include "utilities/sizes.hpp"
  45 
  46 #ifdef PRODUCT
  47 #define BLOCK_COMMENT(str) /* nothing */
  48 #define STOP(error) stop(error)
  49 #else
  50 #define BLOCK_COMMENT(str) block_comment(str)
  51 #define STOP(error) block_comment(error); stop(error)
  52 #endif
  53 
  54 // C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  // Emits the C2 method prolog: optional stack-bang overflow check, RBP
  // save, frame allocation, optional stack-depth cookie and alignment
  // checks, and (unless this is a stub) the nmethod entry barrier.
  //   framesize       - total frame size in bytes, includes the return-address
  //                     slot; must be stack-aligned on entry
  //   stack_bang_size - bytes to bang for the stack-overflow check;
  //                     <= 0 disables banging (caller banged for us)
  //   fp_mode_24b     - not referenced in this body (legacy x87 precision
  //                     flag, presumably -- TODO confirm against the .ad file)
  //   is_stub         - when true, skip the nmethod entry barrier
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // No banging: allocate the whole frame in one step, then store RBP
    // into its slot at the top of the new frame instead of pushing it.
    subptr(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        // RBP must point just below the saved-RBP slot, as if it had been pushed.
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    // Check that RSP is one word below alignment (the return address slot).
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
    // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
    Label dummy_slow_path;
    Label dummy_continuation;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
    }
    bs->nmethod_entry_barrier(this, slow_path, continuation);
  }
}
 136 
 137 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 138   switch (vlen_in_bytes) {
 139     case  4: // fall-through
 140     case  8: // fall-through
 141     case 16: return Assembler::AVX_128bit;
 142     case 32: return Assembler::AVX_256bit;
 143     case 64: return Assembler::AVX_512bit;
 144 
 145     default: {
 146       ShouldNotReachHere();
 147       return Assembler::AVX_NoVec;
 148     }
 149   }
 150 }
 151 
 152 // fast_lock and fast_unlock used by C2
 153 
 154 // Because the transitions from emitted code to the runtime
 155 // monitorenter/exit helper stubs are so slow it's critical that
 156 // we inline both the stack-locking fast path and the inflated fast path.
 157 //
 158 // See also: cmpFastLock and cmpFastUnlock.
 159 //
 160 // What follows is a specialized inline transliteration of the code
 161 // in enter() and exit(). If we're concerned about I$ bloat another
 162 // option would be to emit TrySlowEnter and TrySlowExit methods
 163 // at startup-time.  These methods would accept arguments as
 164 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 165 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 166 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 167 // In practice, however, the # of lock sites is bounded and is usually small.
 168 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
 172 //
 173 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
 174 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 175 // to those specialized methods.  That'd give us a mostly platform-independent
 176 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
 178 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 179 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 180 // (b) explicit barriers or fence operations.
 181 //
 182 // TODO:
 183 //
 184 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 185 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 186 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 187 //    the lock operators would typically be faster than reifying Self.
 188 //
 189 // *  Ideally I'd define the primitives as:
 190 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 191 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 192 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 193 //    Instead, we're stuck with a rather awkward and brittle register assignments below.
 194 //    Furthermore the register assignments are overconstrained, possibly resulting in
 195 //    sub-optimal code near the synchronization site.
 196 //
 197 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 198 //    Alternately, use a better sp-proximity test.
 199 //
 200 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 201 //    Either one is sufficient to uniquely identify a thread.
 202 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 203 //
 204 // *  Intrinsify notify() and notifyAll() for the common cases where the
 205 //    object is locked by the calling thread but the waitlist is empty.
 206 //    avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
 207 //
 208 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 209 //    But beware of excessive branch density on AMD Opterons.
 210 //
 211 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 212 //    or failure of the fast path.  If the fast path fails then we pass
 213 //    control to the slow path, typically in C.  In fast_lock and
 214 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 215 //    will emit a conditional branch immediately after the node.
 216 //    So we have branches to branches and lots of ICC.ZF games.
 217 //    Instead, it might be better to have C2 pass a "FailureLabel"
 218 //    into fast_lock and fast_unlock.  In the case of success, control
 219 //    will drop through the node.  ICC.ZF is undefined at exit.
 220 //    In the case of failure, the node will branch directly to the
 221 //    FailureLabel
 222 
 223 // obj: object to lock
 224 // box: on-stack box address -- KILLED
 225 // rax: tmp -- KILLED
 226 // t  : tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register rax_reg,
                                  Register t, Register thread) {
  // Emits the monitorenter fast path (lightweight lock-stack locking plus
  // the inflated-monitor fast path).  On exit ZF == 1 means the lock was
  // acquired; ZF == 0 means the caller must take the slow path.  C2 emits
  // a conditional branch on ZF immediately after this node (cmpFastLock).
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Value-based classes must not be synchronized on; divert to the slow
    // path which performs the diagnostic logging/abort.
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  // NOTE: mark, monitor and hash below all alias register t; each is dead
  // before the next alias is written.
  const Register mark = t;

  { // Fast Lock

    Label push;

    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmp(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      // Without the table, the (tagged) monitor pointer is the mark itself.
      assert(mark == monitor, "should be the same here");
    } else {
      const Register hash = t;
      Label monitor_found;

      // Look for the monitor in the om_cache.

      // Unrolled scan of the per-thread monitor cache: each entry is an
      // (oop, monitor) pair; load the monitor speculatively, then compare
      // the oop.
      ByteSize cache_offset   = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled  = OMCache::CAPACITY;
      for (int i = 0; i < num_unrolled; i++) {
        movptr(monitor, Address(thread,  cache_offset + monitor_offset));
        cmpptr(obj, Address(thread, cache_offset));
        jccb(Assembler::equal, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      if (UseCompactObjectHeaders) {
        // TODO: The fast-path table lookup currently doesn't work with Lilliput's
        // compact identity-hashcode implementation.
        // See: https://bugs.openjdk.org/browse/JDK-8380981
        jmp(slow_path);
      } else {
        // Look for the monitor in the table.

        // Get the hash code.
        movptr(hash, Address(obj, oopDesc::mark_offset_in_bytes()));
        shrq(hash, markWord::hash_shift);
        andq(hash, markWord::hash_mask);

        // Get the table and calculate the bucket's address.
        lea(rax_reg, ExternalAddress(ObjectMonitorTable::current_table_address()));
        movptr(rax_reg, Address(rax_reg));
        andq(hash, Address(rax_reg, ObjectMonitorTable::table_capacity_mask_offset()));
        movptr(rax_reg, Address(rax_reg, ObjectMonitorTable::table_buckets_offset()));

        // Read the monitor from the bucket.
        movptr(monitor, Address(rax_reg, hash, Address::times_ptr));

        // Check if the monitor in the bucket is special (empty, tombstone or removed)
        cmpptr(monitor, ObjectMonitorTable::SpecialPointerValues::below_is_special);
        jcc(Assembler::below, slow_path);

        // Check if object matches.
        // The monitor holds its object as a weak handle; resolve it (may
        // divert to slow_path if the handle is dead) and compare with obj.
        movptr(rax_reg, Address(monitor, ObjectMonitor::object_offset()));
        BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
        bs_asm->try_resolve_weak_handle_in_c2(this, rax_reg, slow_path);
        cmpptr(rax_reg, obj);
        jcc(Assembler::notEqual, slow_path);
      }
      bind(monitor_found);
    }
    // Without the table, 'monitor' still carries the mark-word tag (0b10),
    // so the field offsets below must subtract it.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock before trashing box. On failure to acquire
      // the lock, the slow path will reset the entry accordingly (see CacheSetter).
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
    lock(); cmpxchgptr(box, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive.
    // On CAS failure rax holds the current owner; compare with our id.
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
 404 
 405 // obj: object to lock
 406 // rax: tmp -- KILLED
 407 // t  : tmp - cannot be obj nor rax -- KILLED
 408 //
 409 // Some commentary on balanced locking:
 410 //
 411 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 412 // Methods that don't have provably balanced locking are forced to run in the
 413 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 414 // The interpreter provides two properties:
 415 // I1:  At return-time the interpreter automatically and quietly unlocks any
 416 //      objects acquired in the current activation (frame).  Recall that the
 417 //      interpreter maintains an on-stack list of locks currently held by
 418 //      a frame.
 419 // I2:  If a method attempts to unlock an object that is not held by the
 420 //      frame the interpreter throws IMSX.
 421 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 423 // B() doesn't have provably balanced locking so it runs in the interpreter.
 424 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 425 // is still locked by A().
 426 //
 427 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface
 428 // Specification" states that an object locked by JNI's MonitorEnter should not be
 429 // unlocked by "normal" java-level locking and vice-versa.  The specification doesn't
 430 // specify what will occur if a program engages in such mixed-mode locking, however.
 431 // Arguably given that the spec legislates the JNI case as undefined our implementation
 432 // could reasonably *avoid* checking owner in fast_unlock().
 433 // In the interest of performance we elide m->Owner==Self check in unlock.
 434 // A perfectly viable alternative is to elide the owner check except when
 435 // Xcheck:jni is enabled.
 436 
void C2_MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register t, Register thread) {
  // Emits the monitorexit fast path.  On exit ZF == 1 means the lock was
  // released; ZF == 0 means the caller must take the slow path.  C2 emits
  // a conditional branch on ZF immediately after this node (cmpFastUnlock).
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully.  MUST jump with ZF == 1
  Label unlocked, slow_path;

  // Register aliasing: mark and monitor share t; top shares either t or
  // reg_rax depending on UseObjectMonitorTable.  Each alias is dead before
  // the next one is written (see the reload of mark under ASSERT below).
  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    // Out-of-line stub for the "CAS failed, re-push obj and go slow" path.
    // During scratch (size-measuring) emission the dummy label is used instead.
    stub = new (Compile::current()->comp_arena()) C2FastUnlockStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Fast Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    // If obj also sits in the next slot down, this was a recursive
    // acquisition: the pop above is all that is needed.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    // CAS failure means the mark changed (e.g. inflated concurrently):
    // the stub re-pushes obj onto the lock-stack and takes the slow path.
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    // Debug-only: verify obj is nowhere on the lock-stack (it must be
    // inflated), and that the mark really carries the monitor tag.
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jcc(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    // Without the table, 'monitor' still carries the mark-word tag (0b10),
    // so the field offsets below must subtract it.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jcc(Assembler::notZero, recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    cmpptr(entry_list_address, NULL_WORD);
    jcc(Assembler::zero, unlocked);    // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jcc(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
  }

  bind(unlocked);
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Unlock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
 598 
// Runtime target called (via generated code) when a CastII range check
// fails; never returns.  See verify_int_in_range() below.
static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}
 602 
 603 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
 604   const int framesize = Compile::current()->output()->frame_size_in_bytes();
 605   masm->movptr(dst, rsp);
 606   if (framesize > 2 * wordSize) {
 607     masm->addptr(dst, framesize - 2 * wordSize);
 608   }
 609 }
 610 
void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
  // Ensure RBP holds a valid frame pointer before an out-of-line runtime
  // call.  With PreserveFramePointer the prolog already maintains RBP, so
  // only a debug-mode consistency check is emitted; otherwise RBP is
  // recomputed from RSP (clobbering whatever RBP held).
  if (PreserveFramePointer) {
    // frame pointer is valid
#ifdef ASSERT
    // Verify frame pointer value in rbp.
    reconstruct_frame_pointer_helper(this, rtmp);
    Label L_success;
    cmpq(rbp, rtmp);
    jccb(Assembler::equal, L_success);
    STOP("frame pointer mismatch");
    bind(L_success);
#endif // ASSERT
  } else {
    reconstruct_frame_pointer_helper(this, rbp);
  }
}
 627 
void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
  // Debug aid: emit code verifying that 'val' lies within the CastII node's
  // type range [t->_lo, t->_hi]; on violation, call
  // abort_verify_int_in_range() which fatals with the details.
  //   idx - node index, reported on failure
  //   t   - the int type whose bounds are checked (no code for TypeInt::INT)
  //   val - register holding the 32-bit value to verify
  jint lo = t->_lo;
  jint hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
  if (t == TypeInt::INT) {
    return;
  }

  BLOCK_COMMENT("CastII {");
  Label fail;
  Label succeed;

  // Bounds equal to the int extremes cannot be violated, so skip those checks.
  if (lo != min_jint) {
    cmpl(val, lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jint) {
    cmpl(val, hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  // Marshal arguments and call the aborting helper (never returns).
  // NOTE(review): idx is written to c_rarg0 before val is copied; this
  // presumes val was not allocated to c_rarg0 -- confirm against the
  // matching .ad effect/constraint.
  movl(c_rarg0, idx);
  movl(c_rarg1, val);
  movl(c_rarg2, lo);
  movl(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastII");
}
 661 
// Runtime target called (via generated code) when a CastLL range check
// fails; never returns.  See verify_long_in_range() below.
static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}
 665 
void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
  // Debug aid: emit code verifying that 'val' lies within the CastLL node's
  // type range [t->_lo, t->_hi]; on violation, call
  // abort_verify_long_in_range() which fatals with the details.
  //   idx - node index, reported on failure
  //   t   - the long type whose bounds are checked (no code for TypeLong::LONG)
  //   val - register holding the 64-bit value to verify
  //   tmp - scratch register for bounds that do not fit in a simm32
  jlong lo = t->_lo;
  jlong hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
  if (t == TypeLong::LONG) {
    return;
  }

  BLOCK_COMMENT("CastLL {");
  Label fail;
  Label succeed;

  // Compare val against a 64-bit bound: immediate form when it fits in
  // 32 bits (sign-extended), otherwise materialize the bound in tmp.
  auto cmp_val = [&](jlong bound) {
    if (is_simm32(bound)) {
      cmpq(val, checked_cast<int>(bound));
    } else {
      mov64(tmp, bound);
      cmpq(val, tmp);
    }
  };

  // Bounds equal to the long extremes cannot be violated, so skip those checks.
  if (lo != min_jlong) {
    cmp_val(lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jlong) {
    cmp_val(hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  // Marshal arguments and call the aborting helper (never returns).
  // NOTE(review): idx is written to c_rarg0 before val is copied; this
  // presumes val/tmp were not allocated to earlier argument registers --
  // confirm against the matching .ad effect/constraint.
  movl(c_rarg0, idx);
  movq(c_rarg1, val);
  mov64(c_rarg2, lo);
  mov64(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastLL");
}
 708 
 709 //-------------------------------------------------------------------------------------------
 710 // Generic instructions support for use in .ad files C2 code generation
 711 
 712 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 713   if (dst != src) {
 714     movdqu(dst, src);
 715   }
 716   if (opcode == Op_AbsVD) {
 717     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 718   } else {
 719     assert((opcode == Op_NegVD),"opcode should be Op_NegD");
 720     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 721   }
 722 }
 723 
 724 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 725   if (opcode == Op_AbsVD) {
 726     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 727   } else {
 728     assert((opcode == Op_NegVD),"opcode should be Op_NegD");
 729     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 730   }
 731 }
 732 
 733 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 734   if (dst != src) {
 735     movdqu(dst, src);
 736   }
 737   if (opcode == Op_AbsVF) {
 738     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 739   } else {
 740     assert((opcode == Op_NegVF),"opcode should be Op_NegF");
 741     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 742   }
 743 }
 744 
 745 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 746   if (opcode == Op_AbsVF) {
 747     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 748   } else {
 749     assert((opcode == Op_NegVF),"opcode should be Op_NegF");
 750     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 751   }
 752 }
 753 
// SSE signed min/max on packed lanes: dst = min/max(dst, src), lane width
// given by elem_bt. Pre-AVX-512 x86 has no packed 64-bit min/max, so the
// T_LONG path composes the result from a signed compare plus blendvpd.
// blendvpd reads its selection mask implicitly from xmm0, which is why the
// caller must pass tmp == xmm0 for T_LONG (and xnoreg otherwise).
void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);  // mask = (dst > src), all-ones per lane where true
      blendvpd(dst, src);  // xmm0 as mask: dst = mask ? src : dst  (i.e. min)
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);  // mask = (src > dst)
      blendvpd(dst, src);  // xmm0 as mask: dst = mask ? src : dst  (i.e. max)
    }
  }
}
 790 
 791 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 792                                   XMMRegister src1, Address src2, int vlen_enc) {
 793   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 794   if (opcode == Op_UMinV) {
 795     switch(elem_bt) {
 796       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 797       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 798       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 799       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 800       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 801     }
 802   } else {
 803     assert(opcode == Op_UMaxV, "required");
 804     switch(elem_bt) {
 805       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 806       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 807       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 808       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 809       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 810     }
 811   }
 812 }
 813 
// Unsigned min/max of packed 64-bit lanes: dst = umin/umax(src1, src2).
// AVX-512 without AVX512VL cannot encode sub-512-bit evpminuq/evpmaxuq, so
// those targets simply run the full 512-bit form. The AVX2 fallback has no
// unsigned 64-bit compare: both operands are biased by 2^63 (adding the sign
// bit) so the signed vpcmpgtq orders them as unsigned values, and the
// resulting mask selects between the original, unbiased operands.
void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // For optimality, leverage a full vector width of 512 bits
  // for operations over smaller vector sizes on AVX512 targets.
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
    // T1 = -1
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63  (the 2^63 bias, i.e. the 64-bit sign bit)
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
    // Mask = T2 > T1  (signed compare of biased values == unsigned src2 > src1)
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}
 844 
 845 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 846                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
 847   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 848   if (opcode == Op_UMinV) {
 849     switch(elem_bt) {
 850       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 851       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 852       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 853       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 854       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 855     }
 856   } else {
 857     assert(opcode == Op_UMaxV, "required");
 858     switch(elem_bt) {
 859       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 860       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 861       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 862       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 863       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 864     }
 865   }
 866 }
 867 
// AVX signed min/max: dst = min/max(src1, src2) per lane. For T_LONG the
// native vpminsq/vpmaxsq require AVX-512 (with VL for sub-512-bit vectors);
// otherwise the result is composed from a signed compare mask and vblendvpd,
// which needs dst to be distinct from both sources since it holds the mask.
void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);        // mask = (src1 > src2)
        vblendvpd(dst, src1, src2, dst, vlen_enc);  // dst = mask ? src2 : src1  (min)
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);        // mask = (src1 > src2)
        vblendvpd(dst, src2, src1, dst, vlen_enc);  // dst = mask ? src1 : src2  (max)
      }
    }
  }
}
 909 
 910 // Float/Double min max
 911 
// Float/double vector min/max with Java semantics (see the long comment in
// the body for why plain vminps/vmaxps is not enough on its own).
void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   *  Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  // Select the float/double flavors of blend / min-max / compare up front so
  // the emission sequence below is written only once.
  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  // 'mask' carries the sign information used to bias the operands: for min it
  // is the first operand's sign, for max the second's (see pseudo code above).
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps:
  // pick which temporary receives the min/max result and which holds the NaN
  // mask so that the final blend is safe even when dst aliases btmp.
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
  if (precompute_mask && !is_double_word) {
    // Fill each 32-bit lane with its sign bit (arithmetic shift by >= 31).
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    // tmp = (0 > mask) per 64-bit lane: all-ones where the sign bit is set.
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);   // atmp: sign-biased first operand
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);    // btmp: sign-biased second operand
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);                        // Tmp  = Min/Max(atmp, btmp)
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);      // scratch: lanes where atmp is NaN
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch); // Res  = NaN ? atmp : Tmp
}
 999 
// AVX-512 float/double vector min/max with Java semantics. Same algorithm as
// vminmax_fp above, but uses opmask registers: evpmov[dq]2m captures the
// per-lane sign bits of the biasing operand, the two masked blends reorder
// the inputs so vmin/vmax resolves -0.0/+0.0 per Java rules, and the final
// masked move replaces any lane where the (biased) first operand was NaN.
void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);                  // ktmp = sign bits of a
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);  // ktmp = NaN lanes
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);   // propagate NaN from atmp
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);                  // ktmp = sign bits of b
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}
1046 
1047 void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1048                                    XMMRegister src1, XMMRegister src2, int vlen_enc) {
1049   assert(opc == Op_MinV || opc == Op_MinReductionV ||
1050          opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");
1051 
1052   int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
1053                                                          : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
1054   if (elem_bt == T_FLOAT) {
1055     evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
1056   } else {
1057     assert(elem_bt == T_DOUBLE, "");
1058     evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
1059   }
1060 }
1061 
1062 // Float/Double signum
1063 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1064   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1065 
1066   Label DONE_LABEL;
1067 
1068   // Handle special cases +0.0/-0.0 and NaN, if argument is +0.0/-0.0 or NaN, return argument
1069   // If AVX10.2 (or newer) floating point comparison instructions used, SF=1 for equal and unordered cases
1070   // If other floating point comparison instructions used, ZF=1 for equal and unordered cases
1071   if (opcode == Op_SignumF) {
1072     if (VM_Version::supports_avx10_2()) {
1073       vucomxss(dst, zero);
1074       jcc(Assembler::negative, DONE_LABEL);
1075     } else {
1076       ucomiss(dst, zero);
1077       jcc(Assembler::equal, DONE_LABEL);
1078     }
1079     movflt(dst, one);
1080     jcc(Assembler::above, DONE_LABEL);
1081     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1082   } else if (opcode == Op_SignumD) {
1083     if (VM_Version::supports_avx10_2()) {
1084       vucomxsd(dst, zero);
1085       jcc(Assembler::negative, DONE_LABEL);
1086     } else {
1087       ucomisd(dst, zero);
1088       jcc(Assembler::equal, DONE_LABEL);
1089     }
1090     movdbl(dst, one);
1091     jcc(Assembler::above, DONE_LABEL);
1092     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1093   }
1094 
1095   bind(DONE_LABEL);
1096 }
1097 
1098 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1099   if (sign) {
1100     pmovsxbw(dst, src);
1101   } else {
1102     pmovzxbw(dst, src);
1103   }
1104 }
1105 
1106 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1107   if (sign) {
1108     vpmovsxbw(dst, src, vector_len);
1109   } else {
1110     vpmovzxbw(dst, src, vector_len);
1111   }
1112 }
1113 
1114 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1115   if (sign) {
1116     vpmovsxbd(dst, src, vector_len);
1117   } else {
1118     vpmovzxbd(dst, src, vector_len);
1119   }
1120 }
1121 
1122 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1123   if (sign) {
1124     vpmovsxwd(dst, src, vector_len);
1125   } else {
1126     vpmovzxwd(dst, src, vector_len);
1127   }
1128 }
1129 
1130 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1131                                      int shift, int vector_len) {
1132   if (opcode == Op_RotateLeftV) {
1133     if (etype == T_INT) {
1134       evprold(dst, src, shift, vector_len);
1135     } else {
1136       assert(etype == T_LONG, "expected type T_LONG");
1137       evprolq(dst, src, shift, vector_len);
1138     }
1139   } else {
1140     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1141     if (etype == T_INT) {
1142       evprord(dst, src, shift, vector_len);
1143     } else {
1144       assert(etype == T_LONG, "expected type T_LONG");
1145       evprorq(dst, src, shift, vector_len);
1146     }
1147   }
1148 }
1149 
1150 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1151                                      XMMRegister shift, int vector_len) {
1152   if (opcode == Op_RotateLeftV) {
1153     if (etype == T_INT) {
1154       evprolvd(dst, src, shift, vector_len);
1155     } else {
1156       assert(etype == T_LONG, "expected type T_LONG");
1157       evprolvq(dst, src, shift, vector_len);
1158     }
1159   } else {
1160     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1161     if (etype == T_INT) {
1162       evprorvd(dst, src, shift, vector_len);
1163     } else {
1164       assert(etype == T_LONG, "expected type T_LONG");
1165       evprorvq(dst, src, shift, vector_len);
1166     }
1167   }
1168 }
1169 
1170 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1171   if (opcode == Op_RShiftVI) {
1172     psrad(dst, shift);
1173   } else if (opcode == Op_LShiftVI) {
1174     pslld(dst, shift);
1175   } else {
1176     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1177     psrld(dst, shift);
1178   }
1179 }
1180 
1181 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1182   switch (opcode) {
1183     case Op_RShiftVI:  psrad(dst, shift); break;
1184     case Op_LShiftVI:  pslld(dst, shift); break;
1185     case Op_URShiftVI: psrld(dst, shift); break;
1186 
1187     default: assert(false, "%s", NodeClassNames[opcode]);
1188   }
1189 }
1190 
1191 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1192   if (opcode == Op_RShiftVI) {
1193     vpsrad(dst, nds, shift, vector_len);
1194   } else if (opcode == Op_LShiftVI) {
1195     vpslld(dst, nds, shift, vector_len);
1196   } else {
1197     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1198     vpsrld(dst, nds, shift, vector_len);
1199   }
1200 }
1201 
1202 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1203   switch (opcode) {
1204     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1205     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1206     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1207 
1208     default: assert(false, "%s", NodeClassNames[opcode]);
1209   }
1210 }
1211 
1212 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1213   switch (opcode) {
1214     case Op_RShiftVB:  // fall-through
1215     case Op_RShiftVS:  psraw(dst, shift); break;
1216 
1217     case Op_LShiftVB:  // fall-through
1218     case Op_LShiftVS:  psllw(dst, shift);   break;
1219 
1220     case Op_URShiftVS: // fall-through
1221     case Op_URShiftVB: psrlw(dst, shift);  break;
1222 
1223     default: assert(false, "%s", NodeClassNames[opcode]);
1224   }
1225 }
1226 
1227 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1228   switch (opcode) {
1229     case Op_RShiftVB:  // fall-through
1230     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1231 
1232     case Op_LShiftVB:  // fall-through
1233     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1234 
1235     case Op_URShiftVS: // fall-through
1236     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1237 
1238     default: assert(false, "%s", NodeClassNames[opcode]);
1239   }
1240 }
1241 
1242 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1243   switch (opcode) {
1244     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems
1245     case Op_LShiftVL:  psllq(dst, shift); break;
1246     case Op_URShiftVL: psrlq(dst, shift); break;
1247 
1248     default: assert(false, "%s", NodeClassNames[opcode]);
1249   }
1250 }
1251 
1252 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1253   if (opcode == Op_RShiftVL) {
1254     psrlq(dst, shift);  // using srl to implement sra on pre-avs512 systems
1255   } else if (opcode == Op_LShiftVL) {
1256     psllq(dst, shift);
1257   } else {
1258     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1259     psrlq(dst, shift);
1260   }
1261 }
1262 
1263 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1264   switch (opcode) {
1265     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1266     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1267     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1268 
1269     default: assert(false, "%s", NodeClassNames[opcode]);
1270   }
1271 }
1272 
1273 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1274   if (opcode == Op_RShiftVL) {
1275     evpsraq(dst, nds, shift, vector_len);
1276   } else if (opcode == Op_LShiftVL) {
1277     vpsllq(dst, nds, shift, vector_len);
1278   } else {
1279     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1280     vpsrlq(dst, nds, shift, vector_len);
1281   }
1282 }
1283 
1284 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1285   switch (opcode) {
1286     case Op_RShiftVB:  // fall-through
1287     case Op_RShiftVS:  // fall-through
1288     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1289 
1290     case Op_LShiftVB:  // fall-through
1291     case Op_LShiftVS:  // fall-through
1292     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1293 
1294     case Op_URShiftVB: // fall-through
1295     case Op_URShiftVS: // fall-through
1296     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1297 
1298     default: assert(false, "%s", NodeClassNames[opcode]);
1299   }
1300 }
1301 
1302 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1303   switch (opcode) {
1304     case Op_RShiftVB:  // fall-through
1305     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1306 
1307     case Op_LShiftVB:  // fall-through
1308     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1309 
1310     case Op_URShiftVB: // fall-through
1311     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1312 
1313     default: assert(false, "%s", NodeClassNames[opcode]);
1314   }
1315 }
1316 
// Per-lane variable shift of 64-bit lanes: dst[i] = src[i] OP shift[i].
// AVX2 has no variable arithmetic right shift for quadwords, so that case is
// emulated; AVX-512 uses evpsravq directly (widened to 512 bits when
// AVX512VL is unavailable).
void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        // Emulate sra via srl: with m = (sign_mask >>> s),
        // (x >>> s) ^ m - m sign-extends the shifted-in bits.
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);   // dst = x >>> s (per lane)
        vpsrlvq(tmp, tmp, shift, vlen_enc);   // tmp = m = sign_mask >>> s
        vpxor(dst, dst, tmp, vlen_enc);       // dst ^= m
        vpsubq(dst, dst, tmp, vlen_enc);      // dst -= m
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}
1349 
1350 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
// Variable shift of 8 byte lanes via dword arithmetic: bytes and their shift
// counts are widened to 32-bit lanes in a 256-bit vector, shifted per lane,
// masked back into byte range, then packed down so dst holds the result as
// words (see the comment above for the caller's expectation).
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);         // bytes -> dwords (sign-aware for sra)
  vpmovzxbd(vtmp, shift, 1);            // shift counts -> dwords (always zero-extend)
  varshiftd(opcode, dst, dst, vtmp, 1); // per-lane dword shift
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); // keep low byte per lane
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);         // pack dwords down to words
}
1364 
1365 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
// Variable shift of byte lanes giving a byte result: bytes and shift counts
// are widened to words in the next-larger vector, shifted per lane with the
// EVEX word variable shifts, masked back to byte range, and packed down to
// bytes. For >128-bit vectors vpackuswb interleaves 128-bit halves, so a
// vpermq fixes up the lane order afterwards.
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  int ext_vector_len = vector_len + 1;  // widened data needs twice the bits
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    vpermq(dst, dst, 0xD8, vector_len);  // undo the 128-bit-lane interleave of vpackuswb
  }
}
1385 
1386 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1387   switch(typ) {
1388     case T_BYTE:
1389       pinsrb(dst, val, idx);
1390       break;
1391     case T_SHORT:
1392       pinsrw(dst, val, idx);
1393       break;
1394     case T_INT:
1395       pinsrd(dst, val, idx);
1396       break;
1397     case T_LONG:
1398       pinsrq(dst, val, idx);
1399       break;
1400     default:
1401       assert(false,"Should not reach here.");
1402       break;
1403   }
1404 }
1405 
1406 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1407   switch(typ) {
1408     case T_BYTE:
1409       vpinsrb(dst, src, val, idx);
1410       break;
1411     case T_SHORT:
1412       vpinsrw(dst, src, val, idx);
1413       break;
1414     case T_INT:
1415       vpinsrd(dst, src, val, idx);
1416       break;
1417     case T_LONG:
1418       vpinsrq(dst, src, val, idx);
1419       break;
1420     default:
1421       assert(false,"Should not reach here.");
1422       break;
1423   }
1424 }
1425 
// Gather one 64-bit slice (4 shorts or 8 bytes) under a bit mask:
// dst[i] = mask bit set ? base[idx_base[i]] : 0. Indices are 32-bit values
// read from idx_base. 'mask_idx' selects the mask bit and is advanced for
// every lane, whether loaded or skipped.
void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
                                         Register base, Register idx_base,
                                         Register mask, Register mask_idx,
                                         Register rtmp, int vlen_enc) {
  vpxor(dst, dst, dst, vlen_enc);  // zero dst so skipped lanes stay 0
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);                    // CF = tested mask bit
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));   // load 32-bit gather index
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
      bind(skip_load);
      incq(mask_idx);                         // advance even when skipped
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);                    // CF = tested mask bit
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));   // load 32-bit gather index
      pinsrb(dst, Address(base, rtmp), i);
      bind(skip_load);
      incq(mask_idx);                         // advance even when skipped
    }
  }
}
1456 
// Unmasked variant of vgather8b_masked: gather one 64-bit slice (4 shorts or
// 8 bytes) into dst, dst[i] = base[idx_base[i]], with 32-bit indices read
// from idx_base.
void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
                                  Register base, Register idx_base,
                                  Register rtmp, int vlen_enc) {
  vpxor(dst, dst, dst, vlen_enc);  // clear lanes above the gathered slice
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = src[idx_base[i]]
      movl(rtmp, Address(idx_base, i * 4));
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = src[idx_base[i]]
      movl(rtmp, Address(idx_base, i * 4));
      pinsrb(dst, Address(base, rtmp), i);
    }
  }
}
1476 
1477 /*
1478  * Gather using hybrid algorithm, first partially unroll scalar loop
1479  * to accumulate values from gather indices into a quad-word(64bit) slice.
1480  * A slice may hold 8 bytes or 4 short values. This is followed by a vector
1481  * permutation to place the slice into appropriate vector lane
1482  * locations in destination vector. Following pseudo code describes the
1483  * algorithm in detail:
1484  *
1485  * DST_VEC = ZERO_VEC
1486  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1487  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1488  * FOREACH_ITER:
1489  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1490  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1491  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1492  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1493  *
1494  * With each iteration, doubleword permute indices (0,1) corresponding
1495  * to gathered quadword gets right shifted by two lane positions.
1496  *
1497  */
// See the algorithm description in the block comment above.
void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
                                        Register base, Register idx_base,
                                        Register mask, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister temp_dst,
                                        Register rtmp, Register mask_idx,
                                        Register length, int vector_len, int vlen_enc) {
  Label GATHER8_LOOP;
  assert(is_subword_type(elem_ty), "");
  movl(length, vector_len);  // remaining lane count, decremented per iteration
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
  vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
  // Build the constant {2, 2, ...} as (0 - (-1)) << 1.
  vallones(xtmp2, vlen_enc);
  vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
  vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
  load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}

  bind(GATHER8_LOOP);
    // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
    if (mask == noreg) {
      vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
    } else {
      vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
    }
    // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
    vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
    // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
    vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
    // DST_VEC = DST_VEC OR TEMP_PERM_VEC
    vpor(dst, dst, temp_dst, vlen_enc);
    // Advance past the consumed 32-bit indices: 8 for bytes, 4 for shorts.
    addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
    // 8 byte lanes or 4 short lanes gathered per iteration; loop until zero.
    subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
    jcc(Assembler::notEqual, GATHER8_LOOP);
}
1531 
1532 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1533   switch(typ) {
1534     case T_INT:
1535       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1536       break;
1537     case T_FLOAT:
1538       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1539       break;
1540     case T_LONG:
1541       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1542       break;
1543     case T_DOUBLE:
1544       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1545       break;
1546     default:
1547       assert(false,"Should not reach here.");
1548       break;
1549   }
1550 }
1551 
1552 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1553   switch(typ) {
1554     case T_INT:
1555       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1556       break;
1557     case T_FLOAT:
1558       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1559       break;
1560     case T_LONG:
1561       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1562       break;
1563     case T_DOUBLE:
1564       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1565       break;
1566     default:
1567       assert(false,"Should not reach here.");
1568       break;
1569   }
1570 }
1571 
1572 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1573   switch(typ) {
1574     case T_INT:
1575       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1576       break;
1577     case T_FLOAT:
1578       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1579       break;
1580     case T_LONG:
1581       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1582       break;
1583     case T_DOUBLE:
1584       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1585       break;
1586     default:
1587       assert(false,"Should not reach here.");
1588       break;
1589   }
1590 }
1591 
// Expand a per-lane boolean vector into an all-ones/all-zeros vector mask of
// elem_bt width. dst = 0 - src turns each byte lane into 0x00 or 0xFF
// (assumes src lanes hold 0 or 1 — TODO confirm against callers), and the
// byte mask is then sign-extended to the element size.
// is_legacy: restrict the subtract to a 256-bit (AVX2) encoding when the
// wider EVEX byte subtract is unavailable.
void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
  if (vlen_in_bytes <= 16) {
    // SSE path: negate the booleans, then widen bytes to the element width.
    pxor (dst, dst);
    psubb(dst, src);
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  pmovsxbw(dst, dst); break;
      case T_INT:    pmovsxbd(dst, dst); break;
      case T_FLOAT:  pmovsxbd(dst, dst); break;
      case T_LONG:   pmovsxbq(dst, dst); break;
      case T_DOUBLE: pmovsxbq(dst, dst); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  } else {
    // Legacy (VEX-only) encodings cannot express a 512-bit subword op.
    assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
    int vlen_enc = vector_length_encoding(vlen_in_bytes);

    vpxor (dst, dst, dst, vlen_enc);
    vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);

    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */            break;
      case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
      case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
      case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
      case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
      case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  }
}
1625 
// Materialize an opmask register (dst) from a byte-boolean vector (src).
// novlbwdq: the target lacks the AVX512VL/BW/DQ forms needed for byte-level
// mask extraction, so widen the booleans to dwords and compare against a
// stub-provided bit pattern to set the mask. Otherwise negate the booleans
// (0/1 -> 0x00/0xFF) and move each byte's sign bit into the mask.
// xtmp is clobbered.
void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
  if (novlbwdq) {
    vpmovsxbd(xtmp, src, vlen_enc);
    evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
            Assembler::eq, true, vlen_enc, noreg);
  } else {
    vpxor(xtmp, xtmp, xtmp, vlen_enc);
    vpsubb(xtmp, xtmp, src, vlen_enc);
    evpmovb2m(dst, xtmp, vlen_enc);
  }
}
1637 
1638 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1639   if (is_integral_type(bt)) {
1640     switch (vlen_in_bytes) {
1641       case 4:  movdl(dst, src);   break;
1642       case 8:  movq(dst, src);    break;
1643       case 16: movdqu(dst, src);  break;
1644       case 32: vmovdqu(dst, src); break;
1645       case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1646       default: ShouldNotReachHere();
1647     }
1648   } else {
1649     switch (vlen_in_bytes) {
1650       case 4:  movflt(dst, src); break;
1651       case 8:  movdbl(dst, src); break;
1652       case 16: movups(dst, src); break;
1653       case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1654       case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1655       default: ShouldNotReachHere();
1656     }
1657   }
1658 }
1659 
1660 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1661   assert(rscratch != noreg || always_reachable(src), "missing");
1662 
1663   if (reachable(src)) {
1664     load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1665   } else {
1666     lea(rscratch, src);
1667     load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1668   }
1669 }
1670 
// Broadcast the constant located at 'src' across all lanes of dst, choosing
// the cheapest instruction the CPU supports: AVX2 integer broadcasts, AVX
// FP broadcasts, SSE3 movddup, or a plain full-width vector load.
void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
  int vlen_enc = vector_length_encoding(vlen);
  if (VM_Version::supports_avx()) {
    if (bt == T_LONG) {
      // 64-bit integral broadcast: vpbroadcastq needs AVX2.
      if (VM_Version::supports_avx2()) {
        vpbroadcastq(dst, src, vlen_enc);
      } else {
        vmovddup(dst, src, vlen_enc);
      }
    } else if (bt == T_DOUBLE) {
      // vbroadcastsd has no 128-bit form; use vmovddup there.
      if (vlen_enc != Assembler::AVX_128bit) {
        vbroadcastsd(dst, src, vlen_enc, noreg);
      } else {
        vmovddup(dst, src, vlen_enc);
      }
    } else {
      // 32-bit (and subword) lanes: integer broadcast on AVX2, else the
      // FP-domain single broadcast, which moves the same 32 bits.
      if (VM_Version::supports_avx2() && is_integral_type(bt)) {
        vpbroadcastd(dst, src, vlen_enc);
      } else {
        vbroadcastss(dst, src, vlen_enc);
      }
    }
  } else if (VM_Version::supports_sse3()) {
    movddup(dst, src);
  } else {
    // No broadcast available: load the fully-expanded constant.
    load_vector(bt, dst, src, vlen);
  }
}
1699 
1700 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1701   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1702   int offset = exact_log2(type2aelembytes(bt)) << 6;
1703   if (is_floating_point_type(bt)) {
1704     offset += 128;
1705   }
1706   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1707   load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1708 }
1709 
1710 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1711 
// Emit one 128-bit step of a strictly-ordered vector reduction:
// dst = dst <op> src, with <op> selected by the C2 reduction opcode and the
// instruction variant by the element type. Ordered FP add/mul use the
// scalar (ss/sd) forms: callers combine FP lanes one at a time. 64-bit
// signed min/max and all unsigned min/max variants have no SSE two-operand
// encoding, so they use AVX/AVX-512 forms (k0 = no masking).
void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
  int vector_len = Assembler::AVX_128bit;

  switch (opcode) {
    case Op_AndReductionV:  pand(dst, src); break;
    case Op_OrReductionV:   por (dst, src); break;
    case Op_XorReductionV:  pxor(dst, src); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:        pminsb(dst, src); break;
        case T_SHORT:       pminsw(dst, src); break;
        case T_INT:         pminsd(dst, src); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:        pmaxsb(dst, src); break;
        case T_SHORT:       pmaxsw(dst, src); break;
        case T_INT:         pmaxsd(dst, src); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMinReductionV:
      switch (typ) {
        case T_BYTE:        vpminub(dst, dst, src, Assembler::AVX_128bit); break;
        case T_SHORT:       vpminuw(dst, dst, src, Assembler::AVX_128bit); break;
        case T_INT:         vpminud(dst, dst, src, Assembler::AVX_128bit); break;
        case T_LONG:        evpminuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxub(dst, dst, src, Assembler::AVX_128bit); break;
        case T_SHORT:       vpmaxuw(dst, dst, src, Assembler::AVX_128bit); break;
        case T_INT:         vpmaxud(dst, dst, src, Assembler::AVX_128bit); break;
        case T_LONG:        evpmaxuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    // Ordered FP reductions accumulate in the scalar (low) lane.
    case Op_AddReductionVF: addss(dst, src); break;
    case Op_AddReductionVD: addsd(dst, src); break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:        paddb(dst, src); break;
        case T_SHORT:       paddw(dst, src); break;
        case T_INT:         paddd(dst, src); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: paddq(dst, src); break;
    case Op_MulReductionVF: mulss(dst, src); break;
    case Op_MulReductionVD: mulsd(dst, src); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT:       pmullw(dst, src); break;
        case T_INT:         pmulld(dst, src); break;
        default:            assert(false, "wrong type");
      }
      break;
    // There is no packed 64x64 multiply below AVX-512DQ.
    case Op_MulReductionVL: assert(UseAVX > 2, "required");
                            evpmullq(dst, dst, src, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}
1782 
1783 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1784   switch (opcode) {
1785     case Op_AddReductionVF: addps(dst, src); break;
1786     case Op_AddReductionVD: addpd(dst, src); break;
1787     case Op_MulReductionVF: mulps(dst, src); break;
1788     case Op_MulReductionVD: mulpd(dst, src); break;
1789     default:                assert(false, "%s", NodeClassNames[opcode]);
1790   }
1791 }
1792 
// Emit one 256-bit step of a vector reduction: dst = src1 <op> src2, with
// <op> selected by the C2 reduction opcode and the instruction variant by
// the element type. 64-bit signed min/max and 64x64 multiply require
// AVX-512; unsigned 64-bit min/max use the EVEX merge form (k0 = no mask).
void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
    case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
    case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
        case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpminsq(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpmaxsq(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMinReductionV:
      switch (typ) {
        case T_BYTE:        vpminub(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpminuw(dst, src1, src2, vector_len); break;
        case T_INT:         vpminud(dst, src1, src2, vector_len); break;
        case T_LONG:        evpminuq(dst, k0, src1, src2, true, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxub(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpmaxuw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmaxud(dst, src1, src2, vector_len); break;
        case T_LONG:        evpmaxuq(dst, k0, src1, src2, true, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
        case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}
1858 
1859 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1860   int vector_len = Assembler::AVX_256bit;
1861 
1862   switch (opcode) {
1863     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1864     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1865     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1866     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1867     default:                assert(false, "%s", NodeClassNames[opcode]);
1868   }
1869 }
1870 
1871 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1872                                   XMMRegister dst, XMMRegister src,
1873                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1874   switch (opcode) {
1875     case Op_AddReductionVF:
1876     case Op_MulReductionVF:
1877       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1878       break;
1879 
1880     case Op_AddReductionVD:
1881     case Op_MulReductionVD:
1882       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1883       break;
1884 
1885     default: assert(false, "wrong opcode");
1886   }
1887 }
1888 
1889 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1890                                             XMMRegister dst, XMMRegister src,
1891                                             XMMRegister vtmp1, XMMRegister vtmp2) {
1892   switch (opcode) {
1893     case Op_AddReductionVF:
1894     case Op_MulReductionVF:
1895       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1896       break;
1897 
1898     case Op_AddReductionVD:
1899     case Op_MulReductionVD:
1900       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1901       break;
1902 
1903     default: assert(false, "%s", NodeClassNames[opcode]);
1904   }
1905 }
1906 
1907 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1908                              Register dst, Register src1, XMMRegister src2,
1909                              XMMRegister vtmp1, XMMRegister vtmp2) {
1910   switch (vlen) {
1911     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1912     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1913     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1914     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1915 
1916     default: assert(false, "wrong vector length");
1917   }
1918 }
1919 
1920 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1921                              Register dst, Register src1, XMMRegister src2,
1922                              XMMRegister vtmp1, XMMRegister vtmp2) {
1923   switch (vlen) {
1924     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1925     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1926     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1927     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1928 
1929     default: assert(false, "wrong vector length");
1930   }
1931 }
1932 
1933 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1934                              Register dst, Register src1, XMMRegister src2,
1935                              XMMRegister vtmp1, XMMRegister vtmp2) {
1936   switch (vlen) {
1937     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1938     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1939     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1940     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1941 
1942     default: assert(false, "wrong vector length");
1943   }
1944 }
1945 
1946 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1947                              Register dst, Register src1, XMMRegister src2,
1948                              XMMRegister vtmp1, XMMRegister vtmp2) {
1949   switch (vlen) {
1950     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1951     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1952     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1953     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1954 
1955     default: assert(false, "wrong vector length");
1956   }
1957 }
1958 
1959 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1960                              Register dst, Register src1, XMMRegister src2,
1961                              XMMRegister vtmp1, XMMRegister vtmp2) {
1962   switch (vlen) {
1963     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1964     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1965     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1966 
1967     default: assert(false, "wrong vector length");
1968   }
1969 }
1970 
1971 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1972   switch (vlen) {
1973     case 2:
1974       assert(vtmp2 == xnoreg, "");
1975       reduce2F(opcode, dst, src, vtmp1);
1976       break;
1977     case 4:
1978       assert(vtmp2 == xnoreg, "");
1979       reduce4F(opcode, dst, src, vtmp1);
1980       break;
1981     case 8:
1982       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1983       break;
1984     case 16:
1985       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1986       break;
1987     default: assert(false, "wrong vector length");
1988   }
1989 }
1990 
1991 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1992   switch (vlen) {
1993     case 2:
1994       assert(vtmp2 == xnoreg, "");
1995       reduce2D(opcode, dst, src, vtmp1);
1996       break;
1997     case 4:
1998       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1999       break;
2000     case 8:
2001       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2002       break;
2003     default: assert(false, "wrong vector length");
2004   }
2005 }
2006 
2007 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2008   switch (vlen) {
2009     case 2:
2010       assert(vtmp1 == xnoreg, "");
2011       assert(vtmp2 == xnoreg, "");
2012       unorderedReduce2F(opcode, dst, src);
2013       break;
2014     case 4:
2015       assert(vtmp2 == xnoreg, "");
2016       unorderedReduce4F(opcode, dst, src, vtmp1);
2017       break;
2018     case 8:
2019       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2020       break;
2021     case 16:
2022       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2023       break;
2024     default: assert(false, "wrong vector length");
2025   }
2026 }
2027 
2028 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2029   switch (vlen) {
2030     case 2:
2031       assert(vtmp1 == xnoreg, "");
2032       assert(vtmp2 == xnoreg, "");
2033       unorderedReduce2D(opcode, dst, src);
2034       break;
2035     case 4:
2036       assert(vtmp2 == xnoreg, "");
2037       unorderedReduce4D(opcode, dst, src, vtmp1);
2038       break;
2039     case 8:
2040       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2041       break;
2042     default: assert(false, "wrong vector length");
2043   }
2044 }
2045 
// Reduce two int lanes of src2 and fold in scalar src1, result in dst.
// Add uses phaddd (adjacent-pair horizontal add); other ops shuffle lane 1
// down and combine it with lane 0.
void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, vtmp1);
  } else {
    pshufd(vtmp1, src2, 0x1); // lane 1 -> lane 0
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
  }
  movdl(vtmp2, src1); // fold in the scalar input
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  movdl(dst, vtmp1);
}
2060 
// Reduce four int lanes of src2 plus scalar src1 into dst: combine the
// lanes pairwise (phaddd for add, high-qword fold otherwise), then finish
// with the 2-lane reducer.
void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, src2);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    pshufd(vtmp2, src2, 0xE); // high qword (lanes 2,3) -> low qword
    reduce_operation_128(T_INT, opcode, vtmp2, src2);
    reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
  }
}
2074 
// Reduce eight int lanes (256-bit src2) plus scalar src1 into dst: fold the
// upper 128 bits onto the lower half, then delegate to the narrower
// reducers.
void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
    vextracti128_high(vtmp2, vtmp1);
    vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    vextracti128_high(vtmp1, src2);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
    reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  }
}
2087 
// Reduce sixteen int lanes (512-bit src2) plus scalar src1 into dst: fold
// the upper 256 bits onto the lower half, then delegate to reduce8I.
void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
  reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2093 
// Reduce eight byte lanes of src2 plus scalar src1 into dst (result
// sign-extended to int). Lanes 4-apart, 2-apart and 1-apart are folded in
// turn; the surviving byte is widened to a dword (zero-extended for the
// unsigned min/max ops) before combining with the scalar input.
void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp2, src2, 0x1); // bytes 4..7 -> low dword
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  movdqu(vtmp1, vtmp2);
  psrldq(vtmp1, 2); // fold 2-apart bytes
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  movdqu(vtmp2, vtmp1);
  psrldq(vtmp2, 1); // fold adjacent bytes
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  movdl(vtmp2, src1);
  if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
    pmovzxbd(vtmp1, vtmp1); // unsigned compare must not sign-extend
  } else {
    pmovsxbd(vtmp1, vtmp1);
  }
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrb(dst, vtmp1, 0x0);
  movsbl(dst, dst);
}
2113 
// Reduce sixteen byte lanes: fold the high 8 bytes onto the low 8, then
// delegate to reduce8B.
void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp1, src2, 0xE); // high qword -> low qword
  reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
  reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2119 
// Reduce thirty-two byte lanes: fold the upper 128 bits onto the lower
// half, then delegate to reduce16B.
void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp2, src2);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2125 
// Reduce sixty-four byte lanes: fold the upper 256 bits onto the lower
// half, then delegate to reduce32B.
void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp1, src2);
  reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
  reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2131 
// Multiply-reduce eight byte lanes: x86 has no packed byte multiply, so
// sign-extend the bytes to shorts and reuse the 8-lane short reducer.
void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pmovsxbw(vtmp2, src2);
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2136 
2137 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2138   if (UseAVX > 1) {
2139     int vector_len = Assembler::AVX_256bit;
2140     vpmovsxbw(vtmp1, src2, vector_len);
2141     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2142   } else {
2143     pmovsxbw(vtmp2, src2);
2144     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2145     pshufd(vtmp2, src2, 0x1);
2146     pmovsxbw(vtmp2, src2);
2147     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2148   }
2149 }
2150 
// Multiply-reduce thirty-two byte lanes. With AVX512BW all 32 bytes widen
// to shorts in one 512-bit vector; otherwise (AVX2) the two 128-bit halves
// are multiply-reduced separately, threading the scalar through dst.
void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
    int vector_len = Assembler::AVX_512bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    assert(UseAVX >= 2,"Should not reach here.");
    mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
    vextracti128_high(vtmp2, src2);
    mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
  }
}
2163 
// Multiply-reduce sixty-four byte lanes: reduce the low 256 bits, then the
// extracted high 256 bits, threading the running scalar through dst.
void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
  vextracti64x4_high(vtmp2, src2);
  mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
}
2169 
// Reduce four short lanes of src2 plus scalar src1 into dst (result
// sign-extended to int). Add uses two horizontal adds; other ops fold the
// 2-apart and adjacent lanes. The surviving short is widened to a dword
// (zero-extended for unsigned min/max) before combining with the scalar.
void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddw(vtmp1, vtmp1);
    phaddw(vtmp1, vtmp1);
  } else {
    pshufd(vtmp2, src2, 0x1); // lanes 2,3 -> lanes 0,1
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
    movdqu(vtmp1, vtmp2);
    psrldq(vtmp1, 2); // fold adjacent shorts
    reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
  }
  movdl(vtmp2, src1);
  if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
    pmovzxwd(vtmp1, vtmp1); // unsigned compare must not sign-extend
  } else {
    pmovsxwd(vtmp1, vtmp1);
  }
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrw(dst, vtmp1, 0x0);
  movswl(dst, dst);
}
2194 
// Reduce eight short lanes: combine the halves (horizontal add for add,
// high-qword fold otherwise), then delegate to reduce4S.
void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddw(vtmp1, src2);
  } else {
    pshufd(vtmp1, src2, 0xE); // high qword (lanes 4..7) -> low qword
    reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
  }
  reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2207 
// Reduce sixteen short lanes (256-bit src2): for add, vphaddw combines
// within each 128-bit lane and vpermq 0xD8 gathers the partial sums into
// the low half; otherwise fold the upper 128 bits onto the lower half.
// Then delegate to reduce8S.
void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    int vector_len = Assembler::AVX_256bit;
    vphaddw(vtmp2, src2, src2, vector_len);
    vpermq(vtmp2, vtmp2, 0xD8, vector_len);
  } else {
    vextracti128_high(vtmp2, src2);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
  }
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2219 
2220 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2221   int vector_len = Assembler::AVX_256bit;
2222   vextracti64x4_high(vtmp1, src2);
2223   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2224   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2225 }
2226 
// Reduce two long lanes of src2 plus scalar src1 into dst: fold lane 1
// onto lane 0, combine with the scalar input, and extract the result.
void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp2, src2, 0xE); // lane 1 -> lane 0
  reduce_operation_128(T_LONG, opcode, vtmp2, src2);
  movdq(vtmp1, src1);
  reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
  movdq(dst, vtmp1);
}
2234 
// Reduce four long lanes: fold the upper 128 bits onto the lower half,
// then delegate to reduce2L.
void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp1, src2);
  reduce_operation_128(T_LONG, opcode, vtmp1, src2);
  reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2240 
// Reduce eight long lanes (512-bit src2): fold the upper 256 bits onto the
// lower half, then delegate to reduce4L.
void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
  reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2246 
// Build an opmask with the low 'len' bits set: start from all-ones, zero
// the bits at positions >= len (BZHI), then move into dst. Clobbers temp.
void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
  mov64(temp, -1L);
  bzhiq(temp, temp, len);
  kmovql(dst, temp);
}
2252 
// Ordered reduction of two float lanes: the running scalar in dst is
// combined with lane 0, then with lane 1 (shuffled into scalar position).
void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_FLOAT, opcode, dst, src);
  pshufd(vtmp, src, 0x1); // lane 1 -> scalar position
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
2258 
// Ordered reduction of four float lanes: lanes 0 and 1 via reduce2F, then
// lanes 2 and 3 shuffled into scalar position one at a time.
void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce2F(opcode, dst, src, vtmp);
  pshufd(vtmp, src, 0x2); // lane 2 -> scalar position
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
  pshufd(vtmp, src, 0x3); // lane 3 -> scalar position
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
2266 
// Ordered reduction of eight float lanes: the low 128 bits first, then the
// extracted high 128 bits, preserving lane order.
void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4F(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce4F(opcode, dst, vtmp2, vtmp1);
}
2272 
// Ordered reduction of sixteen float lanes (512-bit src): the low 256 bits
// first, then the extracted high 256 bits, preserving lane order.
void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce8F(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
}
2278 
// Unordered reduction of two float lanes: lane 1 is shuffled into dst and
// combined with lane 0 using the scalar op.
void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0x1); // lane 1 -> scalar position
  reduce_operation_128(T_FLOAT, opcode, dst, src);
}
2283 
// Unordered reduction of four float lanes: fold the high qword onto the
// low qword packed-wise, then finish with the 2-lane reducer.
void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  pshufd(vtmp, src, 0xE); // lanes 2,3 -> lanes 0,1
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
  unorderedReduce2F(opcode, dst, vtmp);
}
2289 
// Unordered reduction of eight float lanes: fold the upper 128 bits onto
// the lower half packed-wise, then delegate to the 4-lane reducer.
void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf128_high(vtmp1, src);
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
  unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
}
2295 
// Unordered reduction of sixteen float lanes (512-bit 'src'): fold the high
// 256-bit half onto the low half, then reduce the surviving eight lanes.
void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);                                      // high half -> vtmp2
  unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);  // combine halves lane-wise
  unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
}
2301 
// Strictly-ordered reduction of the two double lanes of 'src' into the
// accumulator 'dst': dst = op(op(dst, src[0]), src[1]).
void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_DOUBLE, opcode, dst, src);  // fold lane 0
  pshufd(vtmp, src, 0xE);                            // lane 1 -> position 0
  reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); // fold lane 1
}
2307 
// Strictly-ordered reduction of four double lanes (256-bit 'src') into 'dst':
// low 128-bit half first, then the extracted high half.
void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce2D(opcode, dst, src, vtmp2);   // lanes 0-1
  vextractf128_high(vtmp2, src);       // lanes 2-3 -> vtmp2
  reduce2D(opcode, dst, vtmp2, vtmp1); // lanes 2-3
}
2313 
// Strictly-ordered reduction of eight double lanes (512-bit 'src') into 'dst':
// low 256-bit half first, then the extracted high half.
void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4D(opcode, dst, src, vtmp1, vtmp2);   // lanes 0-3
  vextracti64x4_high(vtmp1, src);             // lanes 4-7 -> vtmp1
  reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); // lanes 4-7 (vtmp1 doubles as scratch)
}
2319 
// Unordered reduction of two double lanes: dst = op(src[0], src[1]).
// 'dst' is a pure output, not an accumulator.
void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0xE);                            // lane 1 -> position 0 of dst
  reduce_operation_128(T_DOUBLE, opcode, dst, src); // combine with lane 0
}
2324 
// Unordered reduction of four double lanes (256-bit 'src'): fold the high
// 128-bit half onto the low half, then combine the remaining two lanes.
void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  vextractf128_high(vtmp, src);                                // high half -> vtmp
  unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src); // combine halves lane-wise
  unorderedReduce2D(opcode, dst, vtmp);
}
2330 
// Unordered reduction of eight double lanes (512-bit 'src'): fold the high
// 256-bit half onto the low half, then reduce the surviving four lanes.
void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);                                      // high half -> vtmp2
  unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src); // combine halves lane-wise
  unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
}
2336 
// Masked vector load (memory -> register); thin forwarder to the shared
// MacroAssembler implementation, exposed here for C2 match rules.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2340 
// Masked vector store (register -> memory); thin forwarder to the shared
// MacroAssembler implementation, exposed here for C2 match rules.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2344 
// Masked register-to-register vector move; thin forwarder to the shared
// MacroAssembler implementation, exposed here for C2 match rules.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2348 
2349 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2350                                  int vec_enc) {
2351   switch(elem_bt) {
2352     case T_INT:
2353     case T_FLOAT:
2354       vmaskmovps(dst, src, mask, vec_enc);
2355       break;
2356     case T_LONG:
2357     case T_DOUBLE:
2358       vmaskmovpd(dst, src, mask, vec_enc);
2359       break;
2360     default:
2361       fatal("Unsupported type %s", type2name(elem_bt));
2362       break;
2363   }
2364 }
2365 
2366 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2367                                  int vec_enc) {
2368   switch(elem_bt) {
2369     case T_INT:
2370     case T_FLOAT:
2371       vmaskmovps(dst, src, mask, vec_enc);
2372       break;
2373     case T_LONG:
2374     case T_DOUBLE:
2375       vmaskmovpd(dst, src, mask, vec_enc);
2376       break;
2377     default:
2378       fatal("Unsupported type %s", type2name(elem_bt));
2379       break;
2380   }
2381 }
2382 
// Float min/max reduction over 'vlen' lanes (vlen in {2, 4, 8, 16}).
// Each iteration halves the working vector: the upper half is brought down
// (vextract* for 256/128-bit halves, vpermilps for 4->2 and 2->1 lane steps)
// and combined with vminmax_fp, until a scalar remains in lane 0.
// If 'is_dst_valid', the incoming value in 'dst' is folded in as a final step.
// The AVX10.2 form needs no scratch; the fallback needs tmp/atmp/btmp.
void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
                                          XMMRegister dst, XMMRegister src,
                                          XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                          XMMRegister xmm_0, XMMRegister xmm_1) {
  // vpermilps immediates for the 4-lane (i==1 -> 14 = 0b1110) and
  // 2-lane (i==0 -> 1) folding steps.
  const int permconst[] = {1, 14};
  XMMRegister wsrc = src;   // working source, rotates to wdst each round
  XMMRegister wdst = xmm_0; // working destination
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; // scratch for the shuffled half

  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 16) {
    vlen_enc = Assembler::AVX_256bit;
  }

  // log2(vlen) halving rounds, widest split first.
  for (int i = log2(vlen) - 1; i >=0; i--) {
    if (i == 0 && !is_dst_valid) {
      // Last round and no pending accumulator: write straight into dst.
      wdst = dst;
    }
    if (i == 3) {
      vextracti64x4_high(wtmp, wsrc);   // 512 -> 256: take upper 256 bits
    } else if (i == 2) {
      vextracti128_high(wtmp, wsrc);    // 256 -> 128: take upper 128 bits
    } else { // i = [0,1]
      vpermilps(wtmp, wsrc, permconst[i], vlen_enc); // in-lane shuffle for 4->2, 2->1
    }

    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }
    wsrc = wdst;                        // result feeds the next round
    vlen_enc = Assembler::AVX_128bit;   // after the first split everything fits in 128 bits
  }
  if (is_dst_valid) {
    // Fold the reduced scalar into the incoming accumulator value in dst.
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2425 
// Double min/max reduction over 'vlen' lanes (vlen in {2, 4, 8}).
// Same halving strategy as reduceFloatMinMax: extract the upper half
// (vextract* for 512/256-bit splits, vpermilpd for the final 2->1 lane step)
// and combine with vminmax_fp until a scalar remains in lane 0.
// If 'is_dst_valid', the incoming value in 'dst' is folded in at the end.
// The AVX10.2 form needs no scratch; the fallback needs tmp/atmp/btmp.
void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
                                        XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                        XMMRegister xmm_0, XMMRegister xmm_1) {
  XMMRegister wsrc = src;   // working source, rotates to wdst each round
  XMMRegister wdst = xmm_0; // working destination
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; // scratch for the shuffled half
  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 8) {
    vlen_enc = Assembler::AVX_256bit;
  }
  // log2(vlen) halving rounds, widest split first.
  for (int i = log2(vlen) - 1; i >=0; i--) {
    if (i == 0 && !is_dst_valid) {
      // Last round and no pending accumulator: write straight into dst.
      wdst = dst;
    }
    if (i == 1) {
      vextracti128_high(wtmp, wsrc);  // 256 -> 128: take upper 128 bits
    } else if (i == 2) {
      vextracti64x4_high(wtmp, wsrc); // 512 -> 256: take upper 256 bits
    } else {
      assert(i == 0, "%d", i);
      vpermilpd(wtmp, wsrc, 1, vlen_enc); // 2 -> 1: swap the two doubles in-lane
    }

    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }

    wsrc = wdst;                      // result feeds the next round
    vlen_enc = Assembler::AVX_128bit; // after the first split everything fits in 128 bits
  }

  if (is_dst_valid) {
    // Fold the reduced scalar into the incoming accumulator value in dst.
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2467 
2468 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2469   switch (bt) {
2470     case T_BYTE:  pextrb(dst, src, idx); break;
2471     case T_SHORT: pextrw(dst, src, idx); break;
2472     case T_INT:   pextrd(dst, src, idx); break;
2473     case T_LONG:  pextrq(dst, src, idx); break;
2474 
2475     default:
2476       assert(false,"Should not reach here.");
2477       break;
2478   }
2479 }
2480 
// Return a register holding the 128-bit lane that contains element 'elemindex'
// of a vector of type 'typ'. Lane 0 needs no work and 'src' is returned
// directly; higher lanes are extracted into 'dst' first.
XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
  int esize =  type2aelembytes(typ);
  int elem_per_lane = 16/esize;            // elements per 128-bit lane
  int lane = elemindex / elem_per_lane;    // which 128-bit lane holds the element
  int eindex = elemindex % elem_per_lane;  // index within that lane (unused here)

  if (lane >= 2) {
    // Lanes 2-3 live in the upper half of a 512-bit register: needs AVX-512.
    assert(UseAVX > 2, "required");
    vextractf32x4(dst, src, lane & 3);
    return dst;
  } else if (lane > 0) {
    // Lane 1 is the upper half of a 256-bit register.
    assert(UseAVX > 0, "required");
    vextractf128(dst, src, lane);
    return dst;
  } else {
    // Lane 0: the element is already in the low 128 bits of src.
    return src;
  }
}
2499 
2500 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2501   if (typ == T_BYTE) {
2502     movsbl(dst, dst);
2503   } else if (typ == T_SHORT) {
2504     movswl(dst, dst);
2505   }
2506 }
2507 
// Move the integral element at 'elemindex' (within its 128-bit lane) from
// vector 'src' into GPR 'dst', sign-extending sub-word values to 32 bits.
// The caller is expected to have selected the correct lane already
// (see get_lane); only the in-lane index is used here.
void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
  int esize =  type2aelembytes(typ);
  int elem_per_lane = 16/esize;           // elements per 128-bit lane
  int eindex = elemindex % elem_per_lane; // index within the lane
  assert(is_integral_type(typ),"required");

  if (eindex == 0) {
    // Element 0: a plain move from the low bits is cheaper than PEXTR.
    if (typ == T_LONG) {
      movq(dst, src);
    } else {
      movdl(dst, src);
      movsxl(typ, dst); // sign-extend byte/short to int
    }
  } else {
    extract(typ, dst, src, eindex);
    movsxl(typ, dst);   // sign-extend byte/short to int
  }
}
2526 
// Move the floating-point element at 'elemindex' (within its 128-bit lane)
// from vector 'src' into the low lane of XMM 'dst', with the bits above the
// element zeroed. 'vtmp' is only needed in the UseAVX == 0 float path.
void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
  int esize =  type2aelembytes(typ);
  int elem_per_lane = 16/esize;           // elements per 128-bit lane
  int eindex = elemindex % elem_per_lane; // index within the lane
  assert((typ == T_FLOAT || typ == T_DOUBLE),"required");

  if (eindex == 0) {
    // Element 0: movq copies the low 64 bits and zeroes the rest.
    movq(dst, src);
  } else {
    if (typ == T_FLOAT) {
      // Shuffle the wanted float into position 0.
      if (UseAVX == 0) {
        movdqu(dst, src);
        shufps(dst, dst, eindex);
      } else {
        vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
      }
    } else {
      // Double at index 1: shift it down by its byte offset, ...
      if (UseAVX == 0) {
        movdqu(dst, src);
        psrldq(dst, eindex*esize);
      } else {
        vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
      }
      movq(dst, dst); // ... then zero everything above the low 64 bits
    }
  }
  // Zero upper bits
  if (typ == T_FLOAT) {
    // For float the shuffle left garbage above bit 31; mask it off.
    if (UseAVX == 0) {
      assert(vtmp != xnoreg, "required.");
      movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
      pand(dst, vtmp);
    } else {
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
    }
  }
}
2564 
2565 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2566   switch(typ) {
2567     case T_BYTE:
2568     case T_BOOLEAN:
2569       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2570       break;
2571     case T_SHORT:
2572     case T_CHAR:
2573       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2574       break;
2575     case T_INT:
2576     case T_FLOAT:
2577       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2578       break;
2579     case T_LONG:
2580     case T_DOUBLE:
2581       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2582       break;
2583     default:
2584       assert(false,"Should not reach here.");
2585       break;
2586   }
2587 }
2588 
2589 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2590   assert(rscratch != noreg || always_reachable(src2), "missing");
2591 
2592   switch(typ) {
2593     case T_BOOLEAN:
2594     case T_BYTE:
2595       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2596       break;
2597     case T_CHAR:
2598     case T_SHORT:
2599       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2600       break;
2601     case T_INT:
2602     case T_FLOAT:
2603       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2604       break;
2605     case T_LONG:
2606     case T_DOUBLE:
2607       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2608       break;
2609     default:
2610       assert(false,"Should not reach here.");
2611       break;
2612   }
2613 }
2614 
2615 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2616   switch(typ) {
2617     case T_BYTE:
2618       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2619       break;
2620     case T_SHORT:
2621       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2622       break;
2623     case T_INT:
2624     case T_FLOAT:
2625       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2626       break;
2627     case T_LONG:
2628     case T_DOUBLE:
2629       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2630       break;
2631     default:
2632       assert(false,"Should not reach here.");
2633       break;
2634   }
2635 }
2636 
// Emit a vector test (sets flags from src1 AND src2) for a vector of
// 'vlen_in_bytes' bytes (<= 32). Vectors shorter than 16 bytes are first
// widened by duplicating the significant part of src1 so the full-register
// test instruction sees consistent data.
void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
  assert(vlen_in_bytes <= 32, "");
  int esize = type2aelembytes(bt);
  if (vlen_in_bytes == 32) {
    assert(vtmp == xnoreg, "required.");
    if (esize >= 4) {
      vtestps(src1, src2, AVX_256bit); // 32/64-bit elements: test sign bits
    } else {
      vptest(src1, src2, AVX_256bit);  // sub-word elements: full bitwise test
    }
    return;
  }
  if (vlen_in_bytes < 16) {
    // Duplicate the lower part to fill the whole register,
    // Don't need to do so for src2
    assert(vtmp != xnoreg, "required");
    // 0x00 replicates dword 0 everywhere (4-byte vector);
    // 0x04 = 0b00_00_01_00 replicates dwords 0,1 (8-byte vector).
    int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
    pshufd(vtmp, src1, shuffle_imm);
  } else {
    assert(vtmp == xnoreg, "required");
    vtmp = src1; // full 16-byte vector: test src1 directly
  }
  if (esize >= 4 && VM_Version::supports_avx()) {
    vtestps(vtmp, src2, AVX_128bit);
  } else {
    ptest(vtmp, src2);
  }
}
2665 
2666 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2667 #ifdef ASSERT
2668   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2669   bool is_bw_supported = VM_Version::supports_avx512bw();
2670   if (is_bw && !is_bw_supported) {
2671     assert(vlen_enc != Assembler::AVX_512bit, "required");
2672     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2673            "XMM register should be 0-15");
2674   }
2675 #endif // ASSERT
2676   switch (elem_bt) {
2677     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2678     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2679     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2680     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2681     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2682     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2683     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2684   }
2685 }
2686 
// Broadcast the scalar in GPR 'src' to every element of vector 'dst'.
// Prefers the AVX-512 GPR-source broadcast forms when the required CPU
// features (BW for byte/short, VL for sub-512-bit) are present; otherwise
// falls back to a GPR->XMM move followed by an XMM-source broadcast.
void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
  assert(UseAVX >= 2, "required");
  bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
  bool is_vl = vlen_enc != Assembler::AVX_512bit;
  if ((UseAVX > 2) &&
      (!is_bw || VM_Version::supports_avx512bw()) &&
      (!is_vl || VM_Version::supports_avx512vl())) {
    // EVEX path: broadcast straight from the GPR.
    switch (elem_bt) {
      case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
      case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
      case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
      case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  } else {
    // VEX fallback: move the scalar into dst, then broadcast dst into itself.
    assert(vlen_enc != Assembler::AVX_512bit, "required");
    assert((dst->encoding() < 16),"XMM register should be 0-15");
    switch (elem_bt) {
      case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
      case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
      case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
      case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
      case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
      case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  }
}
2715 
// Convert a vector of bytes in 'src' to a vector of 'to_elem_bt' in 'dst':
// sign-extend for integral targets, sign-extend to int then int->FP convert
// for floating-point targets.
void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
  switch (to_elem_bt) {
    case T_SHORT:
      vpmovsxbw(dst, src, vlen_enc);
      break;
    case T_INT:
      vpmovsxbd(dst, src, vlen_enc);
      break;
    case T_FLOAT:
      // byte -> int -> float
      vpmovsxbd(dst, src, vlen_enc);
      vcvtdq2ps(dst, dst, vlen_enc);
      break;
    case T_LONG:
      vpmovsxbq(dst, src, vlen_enc);
      break;
    case T_DOUBLE: {
      // byte -> int at half width, then int -> double doubles the element
      // size back up to the requested vector length.
      int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
      vpmovsxbd(dst, src, mid_vlen_enc);
      vcvtdq2pd(dst, dst, vlen_enc);
      break;
    }
    default:
      fatal("Unsupported type %s", type2name(to_elem_bt));
      break;
  }
}
2742 
2743 //-------------------------------------------------------------------------------------------
2744 
2745 // IndexOf for constant substrings with size >= 8 chars
2746 // which don't need to be loaded through stack.
// Emits the IndexOf loop for a *constant* substring of at least 'stride'
// elements (8 chars / 16 bytes), so the substring can always be (re)loaded
// with a single full-width vector load and never through the stack.
// 'ae' selects the Latin1/UTF-16 encoding combination; str1/cnt1 describe the
// scanned string, str2/int_cnt2 the substring. On exit 'result' holds the
// match index or -1. Register choice is fixed by pcmpestri (rax/rdx/rcx).
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2,  Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");

  // Load substring.
  if (ae == StrIntrinsicNode::UL) {
    pmovzxbw(vec, Address(str2, 0)); // Latin1 substring widened to UTF-16
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring reminding elements and
    // cnt1 is number of string reminding elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less then substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less then substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only string if does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    // cnt2 is kept negative below so it can serve as a rising offset
    // from the substring tail while still counting remaining elements.
    negptr(cnt2);
    addptr(cnt2, stride);

    bind(SCAN_SUBSTR);
    subl(cnt1, stride);
    cmpl(cnt2, -stride); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, stride);
    movl(cnt2, stride); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      // Tail offsets fit in a displacement; address relative to substring end.
      int tail_off1 = int_cnt2<<scale1;
      int tail_off2 = int_cnt2<<scale2;
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
      } else {
        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
      }
      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
      } else {
        movdqu(vec, Address(str2, tmp, scale2, 0));
      }
      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
    }
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, stride);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index // bytes -> chars for UTF-16 scanned string
  }
  bind(EXIT);

} // string_indexofC8
2923 
2924 // Small strings are loaded through stack if they cross page boundary.
2925 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2926                                        Register cnt1, Register cnt2,
2927                                        int int_cnt2,  Register result,
2928                                        XMMRegister vec, Register tmp,
2929                                        int ae) {
2930   ShortBranchVerifier sbv(this);
2931   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2932   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2933 
2934   //
2935   // int_cnt2 is length of small (< 8 chars) constant substring
2936   // or (-1) for non constant substring in which case its length
2937   // is in cnt2 register.
2938   //
2939   // Note, inline_string_indexOf() generates checks:
2940   // if (substr.count > string.count) return -1;
2941   // if (substr.count == 0) return 0;
2942   //
2943   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2944   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2945   // This method uses the pcmpestri instruction with bound registers
2946   //   inputs:
2947   //     xmm - substring
2948   //     rax - substring length (elements count)
2949   //     mem - scanned string
2950   //     rdx - string length (elements count)
2951   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2952   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2953   //   outputs:
2954   //     rcx - matched index in string
2955   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2956   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2957   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2958   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2959 
2960   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2961         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2962         FOUND_CANDIDATE;
2963 
2964   { //========================================================
2965     // We don't know where these strings are located
2966     // and we can't read beyond them. Load them through stack.
2967     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2968 
2969     movptr(tmp, rsp); // save old SP
2970 
2971     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2972       if (int_cnt2 == (1>>scale2)) { // One byte
2973         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2974         load_unsigned_byte(result, Address(str2, 0));
2975         movdl(vec, result); // move 32 bits
2976       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2977         // Not enough header space in 32-bit VM: 12+3 = 15.
2978         movl(result, Address(str2, -1));
2979         shrl(result, 8);
2980         movdl(vec, result); // move 32 bits
2981       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2982         load_unsigned_short(result, Address(str2, 0));
2983         movdl(vec, result); // move 32 bits
2984       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2985         movdl(vec, Address(str2, 0)); // move 32 bits
2986       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2987         movq(vec, Address(str2, 0));  // move 64 bits
2988       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2989         // Array header size is 12 bytes in 32-bit VM
2990         // + 6 bytes for 3 chars == 18 bytes,
2991         // enough space to load vec and shift.
2992         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2993         if (ae == StrIntrinsicNode::UL) {
2994           int tail_off = int_cnt2-8;
2995           pmovzxbw(vec, Address(str2, tail_off));
2996           psrldq(vec, -2*tail_off);
2997         }
2998         else {
2999           int tail_off = int_cnt2*(1<<scale2);
3000           movdqu(vec, Address(str2, tail_off-16));
3001           psrldq(vec, 16-tail_off);
3002         }
3003       }
3004     } else { // not constant substring
3005       cmpl(cnt2, stride);
3006       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3007 
3008       // We can read beyond string if srt+16 does not cross page boundary
3009       // since heaps are aligned and mapped by pages.
3010       assert(os::vm_page_size() < (int)G, "default page should be small");
3011       movl(result, str2); // We need only low 32 bits
3012       andl(result, ((int)os::vm_page_size()-1));
3013       cmpl(result, ((int)os::vm_page_size()-16));
3014       jccb(Assembler::belowEqual, CHECK_STR);
3015 
3016       // Move small strings to stack to allow load 16 bytes into vec.
3017       subptr(rsp, 16);
3018       int stk_offset = wordSize-(1<<scale2);
3019       push(cnt2);
3020 
3021       bind(COPY_SUBSTR);
3022       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3023         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3024         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3025       } else if (ae == StrIntrinsicNode::UU) {
3026         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3027         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3028       }
3029       decrement(cnt2);
3030       jccb(Assembler::notZero, COPY_SUBSTR);
3031 
3032       pop(cnt2);
3033       movptr(str2, rsp);  // New substring address
3034     } // non constant
3035 
3036     bind(CHECK_STR);
3037     cmpl(cnt1, stride);
3038     jccb(Assembler::aboveEqual, BIG_STRINGS);
3039 
3040     // Check cross page boundary.
3041     movl(result, str1); // We need only low 32 bits
3042     andl(result, ((int)os::vm_page_size()-1));
3043     cmpl(result, ((int)os::vm_page_size()-16));
3044     jccb(Assembler::belowEqual, BIG_STRINGS);
3045 
3046     subptr(rsp, 16);
3047     int stk_offset = -(1<<scale1);
3048     if (int_cnt2 < 0) { // not constant
3049       push(cnt2);
3050       stk_offset += wordSize;
3051     }
3052     movl(cnt2, cnt1);
3053 
3054     bind(COPY_STR);
3055     if (ae == StrIntrinsicNode::LL) {
3056       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3057       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3058     } else {
3059       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3060       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3061     }
3062     decrement(cnt2);
3063     jccb(Assembler::notZero, COPY_STR);
3064 
3065     if (int_cnt2 < 0) { // not constant
3066       pop(cnt2);
3067     }
3068     movptr(str1, rsp);  // New string address
3069 
3070     bind(BIG_STRINGS);
3071     // Load substring.
3072     if (int_cnt2 < 0) { // -1
3073       if (ae == StrIntrinsicNode::UL) {
3074         pmovzxbw(vec, Address(str2, 0));
3075       } else {
3076         movdqu(vec, Address(str2, 0));
3077       }
3078       push(cnt2);       // substr count
3079       push(str2);       // substr addr
3080       push(str1);       // string addr
3081     } else {
3082       // Small (< 8 chars) constant substrings are loaded already.
3083       movl(cnt2, int_cnt2);
3084     }
3085     push(tmp);  // original SP
3086 
3087   } // Finished loading
3088 
3089   //========================================================
3090   // Start search
3091   //
3092 
3093   movptr(result, str1); // string addr
3094 
3095   if (int_cnt2  < 0) {  // Only for non constant substring
3096     jmpb(SCAN_TO_SUBSTR);
3097 
3098     // SP saved at sp+0
3099     // String saved at sp+1*wordSize
3100     // Substr saved at sp+2*wordSize
3101     // Substr count saved at sp+3*wordSize
3102 
3103     // Reload substr for rescan, this code
3104     // is executed only for large substrings (> 8 chars)
3105     bind(RELOAD_SUBSTR);
3106     movptr(str2, Address(rsp, 2*wordSize));
3107     movl(cnt2, Address(rsp, 3*wordSize));
3108     if (ae == StrIntrinsicNode::UL) {
3109       pmovzxbw(vec, Address(str2, 0));
3110     } else {
3111       movdqu(vec, Address(str2, 0));
3112     }
3113     // We came here after the beginning of the substring was
3114     // matched but the rest of it was not so we need to search
3115     // again. Start from the next element after the previous match.
3116     subptr(str1, result); // Restore counter
3117     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3118       shrl(str1, 1);
3119     }
3120     addl(cnt1, str1);
3121     decrementl(cnt1);   // Shift to next element
3122     cmpl(cnt1, cnt2);
3123     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
3124 
3125     addptr(result, (1<<scale1));
3126   } // non constant
3127 
3128   // Scan string for start of substr in 16-byte vectors
3129   bind(SCAN_TO_SUBSTR);
3130   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3131   pcmpestri(vec, Address(result, 0), mode);
3132   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3133   subl(cnt1, stride);
3134   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3135   cmpl(cnt1, cnt2);
3136   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
3137   addptr(result, 16);
3138 
3139   bind(ADJUST_STR);
3140   cmpl(cnt1, stride); // Do not read beyond string
3141   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3142   // Back-up string to avoid reading beyond string.
3143   lea(result, Address(result, cnt1, scale1, -16));
3144   movl(cnt1, stride);
3145   jmpb(SCAN_TO_SUBSTR);
3146 
3147   // Found a potential substr
3148   bind(FOUND_CANDIDATE);
3149   // After pcmpestri tmp(rcx) contains matched element index
3150 
3151   // Make sure string is still long enough
3152   subl(cnt1, tmp);
3153   cmpl(cnt1, cnt2);
3154   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3155   // Left less then substring.
3156 
3157   bind(RET_NOT_FOUND);
3158   movl(result, -1);
3159   jmp(CLEANUP);
3160 
3161   bind(FOUND_SUBSTR);
3162   // Compute start addr of substr
3163   lea(result, Address(result, tmp, scale1));
3164   if (int_cnt2 > 0) { // Constant substring
3165     // Repeat search for small substring (< 8 chars)
3166     // from new point without reloading substring.
3167     // Have to check that we don't read beyond string.
3168     cmpl(tmp, stride-int_cnt2);
3169     jccb(Assembler::greater, ADJUST_STR);
3170     // Fall through if matched whole substring.
3171   } else { // non constant
3172     assert(int_cnt2 == -1, "should be != 0");
3173 
3174     addl(tmp, cnt2);
3175     // Found result if we matched whole substring.
3176     cmpl(tmp, stride);
3177     jcc(Assembler::lessEqual, RET_FOUND);
3178 
3179     // Repeat search for small substring (<= 8 chars)
3180     // from new point 'str1' without reloading substring.
3181     cmpl(cnt2, stride);
3182     // Have to check that we don't read beyond string.
3183     jccb(Assembler::lessEqual, ADJUST_STR);
3184 
3185     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3186     // Compare the rest of substring (> 8 chars).
3187     movptr(str1, result);
3188 
3189     cmpl(tmp, cnt2);
3190     // First 8 chars are already matched.
3191     jccb(Assembler::equal, CHECK_NEXT);
3192 
3193     bind(SCAN_SUBSTR);
3194     pcmpestri(vec, Address(str1, 0), mode);
3195     // Need to reload strings pointers if not matched whole vector
3196     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3197 
3198     bind(CHECK_NEXT);
3199     subl(cnt2, stride);
3200     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3201     addptr(str1, 16);
3202     if (ae == StrIntrinsicNode::UL) {
3203       addptr(str2, 8);
3204     } else {
3205       addptr(str2, 16);
3206     }
3207     subl(cnt1, stride);
3208     cmpl(cnt2, stride); // Do not read beyond substring
3209     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3210     // Back-up strings to avoid reading beyond substring.
3211 
3212     if (ae == StrIntrinsicNode::UL) {
3213       lea(str2, Address(str2, cnt2, scale2, -8));
3214       lea(str1, Address(str1, cnt2, scale1, -16));
3215     } else {
3216       lea(str2, Address(str2, cnt2, scale2, -16));
3217       lea(str1, Address(str1, cnt2, scale1, -16));
3218     }
3219     subl(cnt1, cnt2);
3220     movl(cnt2, stride);
3221     addl(cnt1, stride);
3222     bind(CONT_SCAN_SUBSTR);
3223     if (ae == StrIntrinsicNode::UL) {
3224       pmovzxbw(vec, Address(str2, 0));
3225     } else {
3226       movdqu(vec, Address(str2, 0));
3227     }
3228     jmp(SCAN_SUBSTR);
3229 
3230     bind(RET_FOUND_LONG);
3231     movptr(str1, Address(rsp, wordSize));
3232   } // non constant
3233 
3234   bind(RET_FOUND);
3235   // Compute substr offset
3236   subptr(result, str1);
3237   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3238     shrl(result, 1); // index
3239   }
3240   bind(CLEANUP);
3241   pop(rsp); // restore SP
3242 
3243 } // string_indexof
3244 
// Intrinsic for StringUTF16.indexOf(char): scans the 16-bit char sequence
// starting at 'str1' (cnt1 chars long) for the char in 'ch' and sets
// 'result' to the char index of the first occurrence, or -1 if not found.
// 'result' doubles as the scan pointer during the search; cnt1, ch and tmp
// are clobbered.  Strategy: a 32-byte AVX2 loop (16 chars/iteration) when
// available and profitable, then a 16-byte SSE loop (8 chars/iteration),
// then a scalar loop for the tail.
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 8; // chars per 16-byte vector

  Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
        SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1); // result is the current scan address
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR);        // < 8 chars: scalar scan only
    cmpl(cnt1, 2*stride);
    jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); // 8..15 chars: 16-byte loop only
    movdl(vec1, ch);
    vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); // splat ch into all 16 word lanes
    vpxor(vec2, vec2);  // vec2 = 0; (v)ptest against it flags any non-zero lane
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
    andl(cnt1,0x0000000F);  //tail count (in chars)

    // Compare 16 chars (32 bytes) per iteration.
    bind(SCAN_TO_16_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqw(vec3, vec3, vec1, 1); // per-lane compare: lane = all-ones where char matches
    vptest(vec2, vec3);            // vec2 is zero, so CF == 0 iff vec3 has a set bit (a match)
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, 2*stride);
    jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
    jmp(SCAN_TO_8_CHAR);
    bind(SCAN_TO_8_CHAR_INIT);
    // Splat ch into all 8 word lanes of a 128-bit vector.
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  bind(SCAN_TO_8_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR);
  if (UseAVX < 2) {
    // Non-AVX2 path has not broadcast ch yet; do it here.
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
  andl(cnt1,0x00000007);  //tail count (in chars)

  // Compare 8 chars (16 bytes) per iteration.
  bind(SCAN_TO_8_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqw(vec3, vec1);
  ptest(vec2, vec3);              // CF == 0 iff some lane matched
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
  bind(SCAN_TO_CHAR);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  // Scalar tail: compare one char at a time.
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_short(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 2);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // Extract a byte mask of the compare result; the lowest set bit gives the
  // byte offset of the first matching char within the vector.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch); // advance scan address to the matching char

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1); // byte offset from start of string...
  shrl(result, 1);      // ...divided by 2 gives the char index

  bind(DONE_LABEL);
} // string_indexof_char
3337 
// Intrinsic for StringLatin1.indexOf(char): scans the byte sequence starting
// at 'str1' (cnt1 bytes long) for the Latin-1 char in 'ch' and sets 'result'
// to the byte index of the first occurrence, or -1 if not found.
// 'result' doubles as the scan pointer during the search; cnt1, ch and tmp
// are clobbered.  Strategy mirrors string_indexof_char but on byte lanes:
// a 32-byte AVX2 loop, then a 16-byte SSE loop, then a scalar tail.
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 16; // bytes per 16-byte vector

  Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
        SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1); // result is the current scan address
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR_INIT);    // < 16 bytes: scalar scan only
    cmpl(cnt1, stride*2);
    jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); // 16..31 bytes: 16-byte loop only
    movdl(vec1, ch);
    vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); // splat ch into all 32 byte lanes
    vpxor(vec2, vec2);  // vec2 = 0; (v)ptest against it flags any non-zero lane
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
    andl(cnt1,0x0000001F);  //tail count (in chars)

    // Compare 32 bytes per iteration.
    bind(SCAN_TO_32_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); // lane = all-ones where byte matches
    vptest(vec2, vec3);            // vec2 is zero, so CF == 0 iff vec3 has a set bit (a match)
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, stride*2);
    jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
    jmp(SCAN_TO_16_CHAR);

    bind(SCAN_TO_16_CHAR_INIT);
    // pshufb with an all-zero mask replicates byte 0 of vec1 into every lane.
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }

  bind(SCAN_TO_16_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
  if (UseAVX < 2) {
    // Non-AVX2 path has not broadcast ch yet; do it here.
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
  andl(cnt1,0x0000000F);  //tail count (in bytes)

  // Compare 16 bytes per iteration.
  bind(SCAN_TO_16_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqb(vec3, vec1);
  ptest(vec2, vec3);              // CF == 0 iff some lane matched
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...

  bind(SCAN_TO_CHAR_INIT);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  // Scalar tail: compare one byte at a time.
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_byte(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 1);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // Extract a byte mask of the compare result; the lowest set bit gives the
  // byte offset of the first matching byte within the vector.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch); // advance scan address to the matching byte

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1); // byte offset == index for Latin-1 (no shift needed)

  bind(DONE_LABEL);
} // stringL_indexof_char
3430 
3431 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3432   switch (eltype) {
3433   case T_BOOLEAN: return sizeof(jboolean);
3434   case T_BYTE:  return sizeof(jbyte);
3435   case T_SHORT: return sizeof(jshort);
3436   case T_CHAR:  return sizeof(jchar);
3437   case T_INT:   return sizeof(jint);
3438   default:
3439     ShouldNotReachHere();
3440     return -1;
3441   }
3442 }
3443 
3444 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3445   switch (eltype) {
3446   // T_BOOLEAN used as surrogate for unsigned byte
3447   case T_BOOLEAN: movzbl(dst, src);   break;
3448   case T_BYTE:    movsbl(dst, src);   break;
3449   case T_SHORT:   movswl(dst, src);   break;
3450   case T_CHAR:    movzwl(dst, src);   break;
3451   case T_INT:     movl(dst, src);     break;
3452   default:
3453     ShouldNotReachHere();
3454   }
3455 }
3456 
3457 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3458   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3459 }
3460 
3461 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3462   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3463 }
3464 
3465 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3466   const int vlen = Assembler::AVX_256bit;
3467   switch (eltype) {
3468   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3469   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3470   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3471   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3472   case T_INT:
3473     // do nothing
3474     break;
3475   default:
3476     ShouldNotReachHere();
3477   }
3478 }
3479 
// Intrinsic for array/string hashCode: folds the cnt1 elements of ary1 into
// the running hash in 'result', one h = h*31 + a[i] step per element, with
// elements widened to int per 'eltype' (31x is computed as 32x - x; 961 is
// 31^2).  Requires AVX2.  Arrays of >= 32 elements are processed 32 at a time
// using four 256-bit accumulators that are scaled by precomputed powers of 31
// (from the StubRoutines table) and horizontally reduced; the remainder is
// handled by a scalar loop unrolled by two, plus an optional last element.
// ary1, cnt1, index, tmp2, tmp3 and all vector registers are clobbered.
void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
                                        Register index, Register tmp2, Register tmp3, XMMRegister vnext,
                                        XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
                                        XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
                                        XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
                                        BasicType eltype) {
  ShortBranchVerifier sbv(this);
  assert(UseAVX >= 2, "AVX2 intrinsics are required");
  assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
  assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);

  Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
        SHORT_UNROLLED_LOOP_EXIT,
        UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
        UNROLLED_VECTOR_LOOP_BEGIN,
        END;
  switch (eltype) {
  case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
  case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
  case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
  case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
  case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
  default:        BLOCK_COMMENT("arrays_hashcode {");                break;
  }

  // For "renaming" for readability of the code
  const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
                    vresult[] = { vresult0, vresult1, vresult2, vresult3 },
                    vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };

  const int elsize = arrays_hashcode_elsize(eltype);

  /*
    if (cnt1 >= 2) {
      if (cnt1 >= 32) {
        UNROLLED VECTOR LOOP
      }
      UNROLLED SCALAR LOOP
    }
    SINGLE SCALAR
   */

  cmpl(cnt1, 32);
  jcc(Assembler::less, SHORT_UNROLLED_BEGIN);

  // cnt1 >= 32 && generate_vectorized_loop
  xorl(index, index);

  // vresult = IntVector.zero(I256);
  for (int idx = 0; idx < 4; idx++) {
    vpxor(vresult[idx], vresult[idx]);
  }
  // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
  Register bound = tmp2;
  Register next = tmp3;
  lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
  movl(next, Address(tmp2, 0));
  movdl(vnext, next);
  vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);

  // index = 0;
  // bound = cnt1 & ~(32 - 1);
  movl(bound, cnt1);
  andl(bound, ~(32 - 1));
  // for (; index < bound; index += 32) {
  bind(UNROLLED_VECTOR_LOOP_BEGIN);
  // result *= next;
  imull(result, next);
  // loop fission to upfront the cost of fetching from memory, OOO execution
  // can then hopefully do a better job of prefetching
  for (int idx = 0; idx < 4; idx++) {
    arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
  }
  // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
  for (int idx = 0; idx < 4; idx++) {
    vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
    arrays_hashcode_elvcast(vtmp[idx], eltype);
    vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
  }
  // index += 32;
  addl(index, 32);
  // index < bound;
  cmpl(index, bound);
  jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
  // }

  // Advance ary1 past the vector-processed prefix; cnt1 becomes the tail count.
  lea(ary1, Address(ary1, bound, Address::times(elsize)));
  subl(cnt1, bound);
  // release bound

  // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
  for (int idx = 0; idx < 4; idx++) {
    lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
    arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
    vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
  }
  // result += vresult.reduceLanes(ADD);
  for (int idx = 0; idx < 4; idx++) {
    reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
  }

  // } else if (cnt1 < 32) {

  bind(SHORT_UNROLLED_BEGIN);
  // int i = 1;
  movl(index, 1);
  cmpl(index, cnt1);
  jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);

  // for (; i < cnt1 ; i += 2) {
  // Each iteration folds in the pair at indices i-1 and i:
  //   result = result*961 + a[i-1]*31 + a[i]   (961 == 31*31)
  bind(SHORT_UNROLLED_LOOP_BEGIN);
  movl(tmp3, 961);
  imull(result, tmp3);
  arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  movl(tmp3, tmp2);
  shll(tmp3, 5);   // tmp3 = tmp2 * 32 ...
  subl(tmp3, tmp2); // ... - tmp2 = tmp2 * 31
  addl(result, tmp3);
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
  addl(result, tmp3);
  addl(index, 2);
  cmpl(index, cnt1);
  jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);

  // }
  // if (i >= cnt1) {
  bind(SHORT_UNROLLED_LOOP_EXIT);
  // Flags are from the preceding cmpl(index, cnt1): index > cnt1 means the
  // element count was even and everything is folded in; index == cnt1 means
  // one trailing element at index-1 remains.
  jccb(Assembler::greater, END);
  movl(tmp2, result);
  shll(result, 5);    // result = result * 32 ...
  subl(result, tmp2); // ... - result = result * 31
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  addl(result, tmp3);
  // }
  bind(END);

  BLOCK_COMMENT("} // arrays_hashcode");

} // arrays_hashcode
3619 
3620 // helper function for string_compare
3621 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3622                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3623                                            Address::ScaleFactor scale2, Register index, int ae) {
3624   if (ae == StrIntrinsicNode::LL) {
3625     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3626     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3627   } else if (ae == StrIntrinsicNode::UU) {
3628     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3629     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3630   } else {
3631     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3632     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3633   }
3634 }
3635 
3636 // Compare strings, used for char[] and byte[].
3637 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3638                                        Register cnt1, Register cnt2, Register result,
3639                                        XMMRegister vec1, int ae, KRegister mask) {
3640   ShortBranchVerifier sbv(this);
3641   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3642   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only AVX3
3643   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3644   int stride2x2 = 0x40;
3645   Address::ScaleFactor scale = Address::no_scale;
3646   Address::ScaleFactor scale1 = Address::no_scale;
3647   Address::ScaleFactor scale2 = Address::no_scale;
3648 
3649   if (ae != StrIntrinsicNode::LL) {
3650     stride2x2 = 0x20;
3651   }
3652 
3653   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3654     shrl(cnt2, 1);
3655   }
3656   // Compute the minimum of the string lengths and the
3657   // difference of the string lengths (stack).
3658   // Do the conditional move stuff
3659   movl(result, cnt1);
3660   subl(cnt1, cnt2);
3661   push(cnt1);
3662   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3663 
3664   // Is the minimum length zero?
3665   testl(cnt2, cnt2);
3666   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3667   if (ae == StrIntrinsicNode::LL) {
3668     // Load first bytes
3669     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3670     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3671   } else if (ae == StrIntrinsicNode::UU) {
3672     // Load first characters
3673     load_unsigned_short(result, Address(str1, 0));
3674     load_unsigned_short(cnt1, Address(str2, 0));
3675   } else {
3676     load_unsigned_byte(result, Address(str1, 0));
3677     load_unsigned_short(cnt1, Address(str2, 0));
3678   }
3679   subl(result, cnt1);
3680   jcc(Assembler::notZero,  POP_LABEL);
3681 
3682   if (ae == StrIntrinsicNode::UU) {
3683     // Divide length by 2 to get number of chars
3684     shrl(cnt2, 1);
3685   }
3686   cmpl(cnt2, 1);
3687   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3688 
3689   // Check if the strings start at the same location and setup scale and stride
3690   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3691     cmpptr(str1, str2);
3692     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3693     if (ae == StrIntrinsicNode::LL) {
3694       scale = Address::times_1;
3695       stride = 16;
3696     } else {
3697       scale = Address::times_2;
3698       stride = 8;
3699     }
3700   } else {
3701     scale1 = Address::times_1;
3702     scale2 = Address::times_2;
3703     // scale not used
3704     stride = 8;
3705   }
3706 
3707   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3708     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3709     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3710     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3711     Label COMPARE_TAIL_LONG;
3712     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only AVX3
3713 
3714     int pcmpmask = 0x19;
3715     if (ae == StrIntrinsicNode::LL) {
3716       pcmpmask &= ~0x01;
3717     }
3718 
3719     // Setup to compare 16-chars (32-bytes) vectors,
3720     // start from first character again because it has aligned address.
3721     if (ae == StrIntrinsicNode::LL) {
3722       stride2 = 32;
3723     } else {
3724       stride2 = 16;
3725     }
3726     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3727       adr_stride = stride << scale;
3728     } else {
3729       adr_stride1 = 8;  //stride << scale1;
3730       adr_stride2 = 16; //stride << scale2;
3731     }
3732 
3733     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3734     // rax and rdx are used by pcmpestri as elements counters
3735     movl(result, cnt2);
3736     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3737     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3738 
3739     // fast path : compare first 2 8-char vectors.
3740     bind(COMPARE_16_CHARS);
3741     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3742       movdqu(vec1, Address(str1, 0));
3743     } else {
3744       pmovzxbw(vec1, Address(str1, 0));
3745     }
3746     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3747     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3748 
3749     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3750       movdqu(vec1, Address(str1, adr_stride));
3751       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3752     } else {
3753       pmovzxbw(vec1, Address(str1, adr_stride1));
3754       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3755     }
3756     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3757     addl(cnt1, stride);
3758 
3759     // Compare the characters at index in cnt1
3760     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3761     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3762     subl(result, cnt2);
3763     jmp(POP_LABEL);
3764 
3765     // Setup the registers to start vector comparison loop
3766     bind(COMPARE_WIDE_VECTORS);
3767     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3768       lea(str1, Address(str1, result, scale));
3769       lea(str2, Address(str2, result, scale));
3770     } else {
3771       lea(str1, Address(str1, result, scale1));
3772       lea(str2, Address(str2, result, scale2));
3773     }
3774     subl(result, stride2);
3775     subl(cnt2, stride2);
3776     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3777     negptr(result);
3778 
3779     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3780     bind(COMPARE_WIDE_VECTORS_LOOP);
3781 
3782     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3783       cmpl(cnt2, stride2x2);
3784       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3785       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3786       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3787 
3788       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3789       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3790         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3791         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3792       } else {
3793         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3794         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3795       }
3796       kortestql(mask, mask);
3797       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3798       addptr(result, stride2x2);  // update since we already compared at this addr
3799       subl(cnt2, stride2x2);      // and sub the size too
3800       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3801 
3802       vpxor(vec1, vec1);
3803       jmpb(COMPARE_WIDE_TAIL);
3804     }//if (VM_Version::supports_avx512vlbw())
3805 
3806     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3807     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3808       vmovdqu(vec1, Address(str1, result, scale));
3809       vpxor(vec1, Address(str2, result, scale));
3810     } else {
3811       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3812       vpxor(vec1, Address(str2, result, scale2));
3813     }
3814     vptest(vec1, vec1);
3815     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3816     addptr(result, stride2);
3817     subl(cnt2, stride2);
3818     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3819     // clean upper bits of YMM registers
3820     vpxor(vec1, vec1);
3821 
3822     // compare wide vectors tail
3823     bind(COMPARE_WIDE_TAIL);
3824     testptr(result, result);
3825     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3826 
3827     movl(result, stride2);
3828     movl(cnt2, result);
3829     negptr(result);
3830     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3831 
3832     // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
3833     bind(VECTOR_NOT_EQUAL);
3834     // clean upper bits of YMM registers
3835     vpxor(vec1, vec1);
3836     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3837       lea(str1, Address(str1, result, scale));
3838       lea(str2, Address(str2, result, scale));
3839     } else {
3840       lea(str1, Address(str1, result, scale1));
3841       lea(str2, Address(str2, result, scale2));
3842     }
3843     jmp(COMPARE_16_CHARS);
3844 
3845     // Compare tail chars, length between 1 to 15 chars
3846     bind(COMPARE_TAIL_LONG);
3847     movl(cnt2, result);
3848     cmpl(cnt2, stride);
3849     jcc(Assembler::less, COMPARE_SMALL_STR);
3850 
3851     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3852       movdqu(vec1, Address(str1, 0));
3853     } else {
3854       pmovzxbw(vec1, Address(str1, 0));
3855     }
3856     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3857     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3858     subptr(cnt2, stride);
3859     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3860     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3861       lea(str1, Address(str1, result, scale));
3862       lea(str2, Address(str2, result, scale));
3863     } else {
3864       lea(str1, Address(str1, result, scale1));
3865       lea(str2, Address(str2, result, scale2));
3866     }
3867     negptr(cnt2);
3868     jmpb(WHILE_HEAD_LABEL);
3869 
3870     bind(COMPARE_SMALL_STR);
3871   } else if (UseSSE42Intrinsics) {
3872     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3873     int pcmpmask = 0x19;
3874     // Setup to compare 8-char (16-byte) vectors,
3875     // start from first character again because it has aligned address.
3876     movl(result, cnt2);
3877     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3878     if (ae == StrIntrinsicNode::LL) {
3879       pcmpmask &= ~0x01;
3880     }
3881     jcc(Assembler::zero, COMPARE_TAIL);
3882     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3883       lea(str1, Address(str1, result, scale));
3884       lea(str2, Address(str2, result, scale));
3885     } else {
3886       lea(str1, Address(str1, result, scale1));
3887       lea(str2, Address(str2, result, scale2));
3888     }
3889     negptr(result);
3890 
3891     // pcmpestri
3892     //   inputs:
3893     //     vec1- substring
3894     //     rax - negative string length (elements count)
3895     //     mem - scanned string
3896     //     rdx - string length (elements count)
3897     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3898     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3899     //   outputs:
3900     //     rcx - first mismatched element index
3901     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3902 
3903     bind(COMPARE_WIDE_VECTORS);
3904     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3905       movdqu(vec1, Address(str1, result, scale));
3906       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3907     } else {
3908       pmovzxbw(vec1, Address(str1, result, scale1));
3909       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3910     }
3911     // After pcmpestri cnt1(rcx) contains mismatched element index
3912 
3913     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3914     addptr(result, stride);
3915     subptr(cnt2, stride);
3916     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3917 
3918     // compare wide vectors tail
3919     testptr(result, result);
3920     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3921 
3922     movl(cnt2, stride);
3923     movl(result, stride);
3924     negptr(result);
3925     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3926       movdqu(vec1, Address(str1, result, scale));
3927       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3928     } else {
3929       pmovzxbw(vec1, Address(str1, result, scale1));
3930       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3931     }
3932     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3933 
3934     // Mismatched characters in the vectors
3935     bind(VECTOR_NOT_EQUAL);
3936     addptr(cnt1, result);
3937     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3938     subl(result, cnt2);
3939     jmpb(POP_LABEL);
3940 
3941     bind(COMPARE_TAIL); // limit is zero
3942     movl(cnt2, result);
3943     // Fallthru to tail compare
3944   }
3945   // Shift str2 and str1 to the end of the arrays, negate min
3946   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3947     lea(str1, Address(str1, cnt2, scale));
3948     lea(str2, Address(str2, cnt2, scale));
3949   } else {
3950     lea(str1, Address(str1, cnt2, scale1));
3951     lea(str2, Address(str2, cnt2, scale2));
3952   }
3953   decrementl(cnt2);  // first character was compared already
3954   negptr(cnt2);
3955 
3956   // Compare the rest of the elements
3957   bind(WHILE_HEAD_LABEL);
3958   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3959   subl(result, cnt1);
3960   jccb(Assembler::notZero, POP_LABEL);
3961   increment(cnt2);
3962   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3963 
3964   // Strings are equal up to min length.  Return the length difference.
3965   bind(LENGTH_DIFF_LABEL);
3966   pop(result);
3967   if (ae == StrIntrinsicNode::UU) {
3968     // Divide diff by 2 to get number of chars
3969     sarl(result, 1);
3970   }
3971   jmpb(DONE_LABEL);
3972 
3973   if (VM_Version::supports_avx512vlbw()) {
3974 
3975     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3976 
3977     kmovql(cnt1, mask);
3978     notq(cnt1);
3979     bsfq(cnt2, cnt1);
3980     if (ae != StrIntrinsicNode::LL) {
3981       // Divide diff by 2 to get number of chars
3982       sarl(cnt2, 1);
3983     }
3984     addq(result, cnt2);
3985     if (ae == StrIntrinsicNode::LL) {
3986       load_unsigned_byte(cnt1, Address(str2, result));
3987       load_unsigned_byte(result, Address(str1, result));
3988     } else if (ae == StrIntrinsicNode::UU) {
3989       load_unsigned_short(cnt1, Address(str2, result, scale));
3990       load_unsigned_short(result, Address(str1, result, scale));
3991     } else {
3992       load_unsigned_short(cnt1, Address(str2, result, scale2));
3993       load_unsigned_byte(result, Address(str1, result, scale1));
3994     }
3995     subl(result, cnt1);
3996     jmpb(POP_LABEL);
3997   }//if (VM_Version::supports_avx512vlbw())
3998 
3999   // Discard the stored length difference
4000   bind(POP_LABEL);
4001   pop(cnt1);
4002 
4003   // That's it
4004   bind(DONE_LABEL);
4005   if(ae == StrIntrinsicNode::UL) {
4006     negl(result);
4007   }
4008 
4009 }
4010 
4011 // Search for Non-ASCII character (Negative byte value) in a byte array,
4012 // return the index of the first such character, otherwise the length
4013 // of the array segment searched.
4014 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4015 //   @IntrinsicCandidate
4016 //   public static int countPositives(byte[] ba, int off, int len) {
4017 //     for (int i = off; i < off + len; i++) {
4018 //       if (ba[i] < 0) {
4019 //         return i - off;
4020 //       }
4021 //     }
4022 //     return len;
4023 //   }
// Emits code that scans ary1[0..len) for the first byte with the sign bit set
// (a negative, i.e. non-ASCII, byte) and leaves in 'result' the count of
// leading non-negative bytes (== len when none is negative). See the Java
// countPositives() sketch above. 'result' starts as a copy of len and is only
// adjusted downward on the paths that found a negative byte.
void C2_MacroAssembler::count_positives(Register ary1, Register len,
  Register result, Register tmp1,
  XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  movl(result, len); // copy; default answer is "all positive"
  // len == 0: nothing to scan, result already holds 0
  testl(len, len);
  jcc(Assembler::zero, DONE);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
    VM_Version::supports_avx512vlbw() &&
    VM_Version::supports_bmi2()) {

    // AVX-512 fast path: compare 64 bytes per iteration against zero.
    Label test_64_loop, test_tail, BREAK_LOOP;
    movl(tmp1, len);
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); // vec2 = all-zero comparand

    andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
    andl(len,  0xffffffc0); // vector count (in chars)
    jccb(Assembler::zero, test_tail);

    // Point ary1 at the end of the vectorized region and count len up to 0.
    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len);

    bind(test_64_loop);
    // Check whether our 64 elements of size byte contain negatives:
    // mask1 bit i is set when 0 > byte[i], i.e. byte[i] is negative.
    evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(mask1, mask1);
    jcc(Assembler::notZero, BREAK_LOOP);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done
    // (testl(tmp1, -1) sets ZF iff tmp1 == 0)
    testl(tmp1, -1);
    jcc(Assembler::zero, DONE);


    // check the tail for absence of negatives
    // Build a kmask with the low tmp1 bits set so only the tail bytes
    // participate in the compare: ~(~0 << len), applied up to two times
    // (for 32-bit scenario).
    {
      Register tmp3_aliased = len;
      mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
      shlxq(tmp3_aliased, tmp3_aliased, tmp1);
      notq(tmp3_aliased);
      kmovql(mask2, tmp3_aliased);
    }

    // Masked compare: only lanes enabled in mask2 (the tail bytes) are tested.
    evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(mask1, mask2);
    jcc(Assembler::zero, DONE);

    // do a full check for negative registers in the tail
    movl(len, tmp1); // tmp1 holds low 6-bit from original len;
                     // ary1 already pointing to the right place
    jmpb(TAIL_START);

    bind(BREAK_LOOP);
    // At least one byte in the last 64 byte block was negative.
    // Set up to look at the last 64 bytes as if they were a tail
    lea(ary1, Address(ary1, len, Address::times_1));
    addptr(result, len);
    // Ignore the very last byte: if all others are positive,
    // it must be negative, so we can skip right to the 2+1 byte
    // end comparison at this point
    orl(result, 63);
    movl(len, 63);
    // Fallthru to tail compare
  } else {

    if (UseAVX >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 32-byte vectors
      testl(len, 0xffffffe0);   // vector count (in bytes)
      jccb(Assembler::zero, TAIL_START);

      andl(len, 0xffffffe0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      // Broadcast the sign-bit mask (0x80 in every byte) into vec2; a
      // vptest against it detects any byte with the high bit set.
      movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);       // ZF set iff no byte has its sign bit set
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 32);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000001f);   // any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      // (re-reads the last, possibly overlapping, 32 bytes ending at the tail)
      movl(len, result);
      andl(len, 0x0000001f);
      vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
      vptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      // There are zeros, jump to the tail to determine exactly where
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 32-byte vector is negative.
      // Set up to look at the last 32 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 31);
      movl(len, 31);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 16-byte vectors
      testl(len, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, TAIL_START);

      andl(len, 0xfffffff0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      // Same sign-bit mask as the AVX2 path, replicated across 16 bytes.
      movl(tmp1, 0x80808080);
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);        // ZF set iff no byte has its sign bit set
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 16);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000000f); // len is zero, any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000000f);   // tail count (in bytes)
      movdqu(vec1, Address(ary1, len, Address::times_1, -16));
      ptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 16-byte vector is negative.
      // Set up and look at the last 16 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 15);
      movl(len, 15);
      // Fallthru to tail compare
    }
  }

  // Scalar tail: at most a handful of bytes remain (or a block known to
  // contain a negative byte is re-scanned to locate it exactly).
  bind(TAIL_START);
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  andl(tmp1, 0x80808080);  // any sign bit set in these 4 bytes?
  jccb(Assembler::notZero, TAIL_ADJUST);
  addptr(len, 4);
  jccb(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2-3 bytes), if any
  bind(COMPARE_CHAR);

  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080);  // sign bits of the two tail bytes
  jccb(Assembler::notZero, CHAR_ADJUST);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1);   // tail  byte
  jccb(Assembler::zero, DONE);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  testl(tmp1, 0x00000080);
  jccb(Assembler::zero, DONE);
  // Last byte is negative: exclude it from the count.
  subptr(result, 1);
  jmpb(DONE);

  bind(TAIL_ADJUST);
  // there are negative bits in the last 4 byte block.
  // Adjust result and check the next three bytes
  addptr(result, len);
  orl(result, 3);
  lea(ary1, Address(ary1, len, Address::times_1));
  jmpb(COMPARE_CHAR);

  bind(CHAR_ADJUST);
  // We are looking at a char + optional byte tail, and found that one
  // of the bytes in the char is negative. Adjust the result, check the
  // first byte and readjust if needed.
  andl(result, 0xfffffffc);
  testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
  jccb(Assembler::notZero, DONE);
  addptr(result, 1);

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
4255 
4256 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
// Emits code that compares two char[]/byte[] arrays (or substrings) for
// equality, leaving 1 in 'result' on equality and 0 otherwise.
//   is_array_equ - full Arrays.equals semantics: also compare references,
//                  null-check both arrays, compare lengths, and skip past the
//                  array headers; otherwise the inputs are raw data pointers
//                  with 'limit' giving the element count.
//   expand_ary2  - ary1 holds 2-byte elements that are compared against the
//                  zero-extended bytes of ary2 (AVX2-only, see assert below);
//                  'limit' is then the (smaller) size of the second array.
// Clobbers: limit, chr, vec1, vec2, mask (and ary1/ary2 are advanced).
void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                      Register limit, Register result, Register chr,
                                      XMMRegister vec1, XMMRegister vec2, bool is_char,
                                      KRegister mask, bool expand_ary2) {
  // for expand_ary2, limit is the (smaller) size of the second array.
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;

  assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
         "Expansion only implemented for AVX2");

  int length_offset  = arrayOopDesc::length_offset_in_bytes();
  int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);

  // When expanding, ary1 is indexed in 2-byte steps and each vector loop
  // iteration consumes half as many ary2 bytes.
  Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
  int scaleIncr = expand_ary2 ? 8 : 16;

  if (is_array_equ) {
    // Check the input args: same reference implies equal.
    cmpoop(ary1, ary2);
    jcc(Assembler::equal, TRUE_LABEL);

    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0: trivially equal
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address (skip past the object header to the data)
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }

  if (is_array_equ && is_char) {
    // arrays_equals when used for char[]: convert element count to bytes.
    shll(limit, 1);      // byte count != 0
  }
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;

    // Compare 32-byte vectors
    if (expand_ary2) {
      andl(result, 0x0000000f);  //   tail count (in bytes)
      andl(limit, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL);
    } else {
      andl(result, 0x0000001f);  //   tail count (in bytes)
      andl(limit, 0xffffffe0);   // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL_16);
    }

    // Point both arrays at the end of the vectorized region; count limit up.
    lea(ary1, Address(ary1, limit, scaleFactor));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;

      cmpl(limit, -64);
      jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop

      // Full 64-byte compare: mask gets a bit per equal byte; kortest sets
      // CF only when all 64 bits are set.
      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
      addptr(limit, 64);  // update since we already compared at this addr
      cmpl(limit, -64);
      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instruction and just continue via non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL);  // true
      // But since we stopped at the points ary{1,2}+limit which are
      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
      //
      addptr(result, -64);   // it is safe, bc we just came from this area
      evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare

      jmp(TRUE_LABEL);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    }//if (VM_Version::supports_avx512vlbw())

    bind(COMPARE_WIDE_VECTORS);
    vmovdqu(vec1, Address(ary1, limit, scaleFactor));
    if (expand_ary2) {
      // Zero-extend 16 bytes of ary2 to 16 shorts before comparing.
      vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
    } else {
      vmovdqu(vec2, Address(ary2, limit, Address::times_1));
    }
    vpxor(vec1, vec2);   // zero iff equal

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, scaleIncr * 2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    // Compare the (possibly overlapping) last full vector ending at the tail.
    vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
    } else {
      vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
    }
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    jmp(TRUE_LABEL);

    bind(COMPARE_TAIL_16); // limit is zero
    movl(limit, result);

    // Compare 16-byte chunks
    andl(result, 0x0000000f);  //   tail count (in bytes)
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, scaleFactor));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS_16);
    movdqu(vec1, Address(ary1, limit, scaleFactor));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
    } else {
      movdqu(vec2, Address(ary2, limit, Address::times_1));
    }
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, scaleIncr);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  } else if (UseSSE42Intrinsics) {
    // With SSE4.2, use double quad vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 16-byte vectors
    andl(result, 0x0000000f);  //   tail count (in bytes)
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(ary1, limit, Address::times_1));
    movdqu(vec2, Address(ary2, limit, Address::times_1));
    pxor(vec1, vec2);   // zero iff equal

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 16);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    // Overlapping compare of the last 16 bytes ending at the tail.
    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  }

  // Compare 4-byte vectors
  if (expand_ary2) {
    testl(result, result);
    jccb(Assembler::zero, TRUE_LABEL);
  } else {
    andl(limit, 0xfffffffc); // vector count (in bytes)
    jccb(Assembler::zero, COMPARE_CHAR);
  }

  lea(ary1, Address(ary1, limit, scaleFactor));
  lea(ary2, Address(ary2, limit, Address::times_1));
  negptr(limit);

  bind(COMPARE_VECTORS);
  if (expand_ary2) {
    // There are no "vector" operations for bytes to shorts:
    // compare one zero-extended byte against one short per iteration.
    movzbl(chr, Address(ary2, limit, Address::times_1));
    cmpw(Address(ary1, limit, Address::times_2), chr);
    jccb(Assembler::notEqual, FALSE_LABEL);
    addptr(limit, 1);
    jcc(Assembler::notZero, COMPARE_VECTORS);
    jmp(TRUE_LABEL);
  } else {
    movl(chr, Address(ary1, limit, Address::times_1));
    cmpl(chr, Address(ary2, limit, Address::times_1));
    jccb(Assembler::notEqual, FALSE_LABEL);
    addptr(limit, 4);
    jcc(Assembler::notZero, COMPARE_VECTORS);
  }

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(chr, Address(ary1, 0));
  load_unsigned_short(limit, Address(ary2, 0));
  cmpl(chr, limit);
  jccb(Assembler::notEqual, FALSE_LABEL);

  if (is_array_equ && is_char) {
    // char[] data is always 2-byte aligned, so there is no 1-byte tail.
    bind(COMPARE_BYTE);
  } else {
    lea(ary1, Address(ary1, 2));
    lea(ary2, Address(ary2, 2));

    bind(COMPARE_BYTE);
    testl(result, 0x1);   // tail  byte
    jccb(Assembler::zero, TRUE_LABEL);
    load_unsigned_byte(chr, Address(ary1, 0));
    load_unsigned_byte(limit, Address(ary2, 0));
    cmpl(chr, limit);
    jccb(Assembler::notEqual, FALSE_LABEL);
  }
  bind(TRUE_LABEL);
  movl(result, 1);   // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
4528 
4529 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4530 #define __ masm.
4531   Register dst = stub.data<0>();
4532   XMMRegister src = stub.data<1>();
4533   address target = stub.data<2>();
4534   __ bind(stub.entry());
4535   __ subptr(rsp, 8);
4536   __ movdbl(Address(rsp), src);
4537   __ call(RuntimeAddress(target));
4538   // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
4539   __ pop(dst);
4540   __ jmp(stub.continuation());
4541 #undef __
4542 }
4543 
4544 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4545   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4546   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4547 
4548   address slowpath_target;
4549   if (dst_bt == T_INT) {
4550     if (src_bt == T_FLOAT) {
4551       cvttss2sil(dst, src);
4552       cmpl(dst, 0x80000000);
4553       slowpath_target = StubRoutines::x86::f2i_fixup();
4554     } else {
4555       cvttsd2sil(dst, src);
4556       cmpl(dst, 0x80000000);
4557       slowpath_target = StubRoutines::x86::d2i_fixup();
4558     }
4559   } else {
4560     if (src_bt == T_FLOAT) {
4561       cvttss2siq(dst, src);
4562       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4563       slowpath_target = StubRoutines::x86::f2l_fixup();
4564     } else {
4565       cvttsd2siq(dst, src);
4566       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4567       slowpath_target = StubRoutines::x86::d2l_fixup();
4568     }
4569   }
4570 
4571   // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
4572   int max_size = 23 + (UseAPX ? 1 : 0);
4573   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
4574   jcc(Assembler::equal, stub->entry());
4575   bind(stub->continuation());
4576 }
4577 
// Emits the AVX-512 masked (predicated) instruction corresponding to the
// given ideal shift/rotate opcode, with an immediate shift/rotate count.
//   ideal_opc - ideal graph opcode (Op_*ShiftV*, Op_Rotate*V) selecting the instruction
//   eType     - element basic type; only consulted by the rotate helpers
//   mask      - opmask register controlling which lanes are written
//   dst/src1  - destination and source vector registers
//   imm8      - immediate shift/rotate amount
//   merge     - true: masked-off lanes keep dst's value; false: they are zeroed
//   vlen_enc  - vector length encoding (128/256/512-bit)
// Unsupported opcodes are a fatal error (one-to-one dispatch table).
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, int imm8, bool merge, int vlen_enc) {
  switch(ideal_opc) {
    case Op_LShiftVS:
      Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_LShiftVI:
      Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_LShiftVL:
      Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVS:
      Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVI:
      Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVL:
      Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVS:
      Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVI:
      Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVL:
      Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RotateRightV:
      evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RotateLeftV:
      evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4608 
4609 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4610                                                XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4611   if (is_unsigned) {
4612     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4613   } else {
4614     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4615   }
4616 }
4617 
4618 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4619                                                       XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4620   switch (elem_bt) {
4621     case T_BYTE:
4622       if (ideal_opc == Op_SaturatingAddV) {
4623         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4624       } else {
4625         assert(ideal_opc == Op_SaturatingSubV, "");
4626         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4627       }
4628       break;
4629     case T_SHORT:
4630       if (ideal_opc == Op_SaturatingAddV) {
4631         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4632       } else {
4633         assert(ideal_opc == Op_SaturatingSubV, "");
4634         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4635       }
4636       break;
4637     default:
4638       fatal("Unsupported type %s", type2name(elem_bt));
4639       break;
4640   }
4641 }
4642 
4643 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4644                                                         XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4645   switch (elem_bt) {
4646     case T_BYTE:
4647       if (ideal_opc == Op_SaturatingAddV) {
4648         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4649       } else {
4650         assert(ideal_opc == Op_SaturatingSubV, "");
4651         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4652       }
4653       break;
4654     case T_SHORT:
4655       if (ideal_opc == Op_SaturatingAddV) {
4656         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4657       } else {
4658         assert(ideal_opc == Op_SaturatingSubV, "");
4659         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4660       }
4661       break;
4662     default:
4663       fatal("Unsupported type %s", type2name(elem_bt));
4664       break;
4665   }
4666 }
4667 
4668 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4669                                                Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4670   if (is_unsigned) {
4671     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4672   } else {
4673     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4674   }
4675 }
4676 
4677 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4678                                                       XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4679   switch (elem_bt) {
4680     case T_BYTE:
4681       if (ideal_opc == Op_SaturatingAddV) {
4682         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4683       } else {
4684         assert(ideal_opc == Op_SaturatingSubV, "");
4685         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4686       }
4687       break;
4688     case T_SHORT:
4689       if (ideal_opc == Op_SaturatingAddV) {
4690         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4691       } else {
4692         assert(ideal_opc == Op_SaturatingSubV, "");
4693         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4694       }
4695       break;
4696     default:
4697       fatal("Unsupported type %s", type2name(elem_bt));
4698       break;
4699   }
4700 }
4701 
4702 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4703                                                         XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4704   switch (elem_bt) {
4705     case T_BYTE:
4706       if (ideal_opc == Op_SaturatingAddV) {
4707         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4708       } else {
4709         assert(ideal_opc == Op_SaturatingSubV, "");
4710         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4711       }
4712       break;
4713     case T_SHORT:
4714       if (ideal_opc == Op_SaturatingAddV) {
4715         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4716       } else {
4717         assert(ideal_opc == Op_SaturatingSubV, "");
4718         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4719       }
4720       break;
4721     default:
4722       fatal("Unsupported type %s", type2name(elem_bt));
4723       break;
4724   }
4725 }
4726 
// Dispatch a predicated (opmask-controlled) vector operation to the AVX-512
// instruction implementing the given C2 ideal opcode, register-register form.
// 'mask' supplies the predication opmask, 'merge' selects merge- vs
// zero-masking, 'vlen_enc' encodes the vector length, and 'is_varshift'
// selects the per-lane variable-shift instruction forms for the shift opcodes.
// Unary opcodes (abs) ignore src1 and operate on src2 only.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
                                    bool is_varshift) {
  switch (ideal_opc) {
    case Op_AddVB:
      evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVS:
      evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVI:
      evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVL:
      evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVF:
      evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVD:
      evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVB:
      evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVS:
      evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVI:
      evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVL:
      evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVF:
      evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVD:
      evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVS:
      evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVI:
      evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVL:
      evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVF:
      evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVD:
      evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVF:
      evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVD:
      evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SqrtVF:
      evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SqrtVD:
      evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Unary: abs operates on src2 only.
    case Op_AbsVB:
      evpabsb(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVS:
      evpabsw(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVI:
      evpabsd(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVL:
      evpabsq(dst, mask, src2, merge, vlen_enc); break;
    case Op_FmaVF:
      evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVD:
      evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Note: rearrange takes (shuffle, data) in swapped operand order.
    case Op_VectorRearrange:
      evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
    // Shifts: is_varshift selects the per-lane variable-shift forms.
    case Op_LShiftVS:
      evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_LShiftVI:
      evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_LShiftVL:
      evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVS:
      evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVI:
      evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVL:
      evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVS:
      evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVI:
      evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVL:
      evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RotateLeftV:
      evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_RotateRightV:
      evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MaxV:
      evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MinV:
      evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMinV:
      evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMaxV:
      evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_XorV:
      evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_OrV:
      evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AndV:
      evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4828 
// Dispatch a predicated (opmask-controlled) vector operation to the AVX-512
// instruction implementing the given C2 ideal opcode, memory-operand form:
// the second source comes from memory. Supports a smaller opcode set than the
// register-register overload (no unary/shift/rotate/rearrange forms).
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, Address src2, bool merge, int vlen_enc) {
  switch (ideal_opc) {
    case Op_AddVB:
      evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVS:
      evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVI:
      evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVL:
      evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVF:
      evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVD:
      evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVB:
      evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVS:
      evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVI:
      evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVL:
      evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVF:
      evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVD:
      evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVS:
      evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVI:
      evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVL:
      evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVF:
      evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVD:
      evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVF:
      evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVD:
      evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVF:
      evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVD:
      evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MaxV:
      evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MinV:
      evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMaxV:
      evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMinV:
      evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_XorV:
      evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_OrV:
      evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AndV:
      evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4893 
4894 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4895                                   KRegister src1, KRegister src2) {
4896   BasicType etype = T_ILLEGAL;
4897   switch(mask_len) {
4898     case 2:
4899     case 4:
4900     case 8:  etype = T_BYTE; break;
4901     case 16: etype = T_SHORT; break;
4902     case 32: etype = T_INT; break;
4903     case 64: etype = T_LONG; break;
4904     default: fatal("Unsupported type"); break;
4905   }
4906   assert(etype != T_ILLEGAL, "");
4907   switch(ideal_opc) {
4908     case Op_AndVMask:
4909       kand(etype, dst, src1, src2); break;
4910     case Op_OrVMask:
4911       kor(etype, dst, src1, src2); break;
4912     case Op_XorVMask:
4913       kxor(etype, dst, src1, src2); break;
4914     default:
4915       fatal("Unsupported masked operation"); break;
4916   }
4917 }
4918 
4919 /*
4920  * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
4921  * If src is NaN, the result is 0.
4922  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4923  * the result is equal to the value of Integer.MIN_VALUE.
4924  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4925  * the result is equal to the value of Integer.MAX_VALUE.
4926  */
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                   XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
                                                                   Register rscratch, AddressLiteral float_sign_flip,
                                                                   int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // Compare the converted lanes against the sign-flip constant (0x80000000);
  // equal lanes came from a special source value (NaN/Inf/-Inf/out-of-range).
  vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
  vptest(xtmp2, xtmp2, vec_enc);
  // Fast path: no special lanes, the raw conversion result is already correct.
  jccb(Assembler::equal, done);

  // Flip the sign-flip constant (xor with all-ones) to get the max-int
  // pattern in xtmp1.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
  vpxor(xtmp1, xtmp1, xtmp4, vec_enc);

  // Zero the destination lanes whose source is NaN (unordered self-compare).
  vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
  vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);

  // Recompute the mask for remaining special value.
  vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
  // Extract SRC values corresponding to TRUE mask lanes.
  vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
  // values are set.
  vpxor(xtmp3, xtmp2, xtmp4, vec_enc);

  // Overwrite those lanes with the max-int pattern prepared in xtmp1.
  vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
  bind(done);
}
4956 
// EVEX variant of the float->int special-value fix-up: uses opmask registers
// instead of vector blends to patch NaN and out-of-range lanes.
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                    XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                    Register rscratch, AddressLiteral float_sign_flip,
                                                                    int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // ktmp1 marks lanes equal to the sign-flip constant, i.e. lanes whose
  // source was a special value.
  evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  // Fast path: no special lanes.
  jccb(Assembler::equal, done);

  // Zero the lanes whose source is NaN (unordered self-compare into ktmp2).
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes = special mask minus the NaN lanes.
  kxorwl(ktmp1, ktmp1, ktmp2);
  // Of those, keep only lanes with a non-negative source (src >= 0.0).
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  // xtmp2 = ~xtmp1 (ternlog 0x11), turning the sign-flip pattern into max-int,
  // then store it into the selected lanes.
  vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
4978 
// EVEX float->long special-value fix-up: same structure as the float->int
// variant, but the converted lanes are 64-bit so the sentinel/patch constants
// are quadwords.
void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral double_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // ktmp1 marks 64-bit lanes equal to the sign-flip sentinel (special sources).
  evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  // Fast path: no special lanes.
  jccb(Assembler::equal, done);

  // Zero the lanes whose (float) source is NaN.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes minus NaNs; keep only non-negative sources.
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  // xtmp2 = ~xtmp1 (ternlog 0x11): sign-flip pattern becomes the max-long
  // pattern; write it into the selected lanes.
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5001 
// EVEX double->int special-value fix-up: the converted result has 32-bit
// lanes (compared as dwords) while the source compares are done on the
// 64-bit double lanes.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral float_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // ktmp1 marks 32-bit result lanes equal to the sign-flip sentinel.
  evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  // Fast path: no special lanes.
  jccb(Assembler::equal, done);

  // Zero the lanes whose (double) source is NaN.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes minus NaNs; keep only non-negative sources.
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  // xtmp2 = ~xtmp1 (ternlog 0x11): sign-flip pattern becomes max-int; write
  // it into the selected lanes.
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5023 
5024 /*
5025  * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
5026  * If src is NaN, the result is 0.
5027  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5028  * the result is equal to the value of Long.MIN_VALUE.
5029  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5030  * the result is equal to the value of Long.MAX_VALUE.
5031  */
void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                      Register rscratch, AddressLiteral double_sign_flip,
                                                                      int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // ktmp1 marks 64-bit result lanes equal to the sign-flip sentinel, i.e.
  // lanes whose source was a special value (NaN/Inf/-Inf/out-of-range).
  evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  // Fast path: no special lanes.
  jccb(Assembler::equal, done);

  // Zero the lanes whose (double) source is NaN (unordered self-compare).
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes minus NaNs; keep only non-negative sources.
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  // xtmp2 = ~xtmp1 (ternlog 0x11): sign-flip pattern becomes the
  // Long.MAX_VALUE pattern; write it into the selected lanes.
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5054 
5055 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5056                                                              XMMRegister xtmp, int index, int vec_enc) {
5057    assert(vec_enc < Assembler::AVX_512bit, "");
5058    if (vec_enc == Assembler::AVX_256bit) {
5059      vextractf128_high(xtmp, src);
5060      vshufps(dst, src, xtmp, index, vec_enc);
5061    } else {
5062      vshufps(dst, src, zero, index, vec_enc);
5063    }
5064 }
5065 
// AVX2 double->int special-value fix-up. The int results live in a 128-bit
// register while the double sources occupy src_vec_enc-sized lanes, so the
// source-side masks are packed down to doublewords before blending.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                                    XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
                                                                    AddressLiteral float_sign_flip, int src_vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");

  Label done;
  // Compare the destination lanes with float_sign_flip
  // value to get mask for all special values.
  movdqu(xtmp1, float_sign_flip, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
  ptest(xtmp2, xtmp2);
  jccb(Assembler::equal, done);

  // Flip float_sign_flip to get max integer value.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
  pxor(xtmp1, xtmp4);

  // Set destination lanes corresponding to unordered source lanes as zero.
  vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
  vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);

  // Shuffle mask vector and pack lower doubleword from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);

  // Recompute the mask for remaining special value.
  pxor(xtmp2, xtmp3);
  // Extract mask corresponding to non-negative source lanes.
  vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);

  // Shuffle mask vector and pack lower doubleword from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  pand(xtmp3, xtmp2);

  // Replace destination lanes holding special value(0x80000000) with max int
  // if corresponding source lane holds a +ve value.
  vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
  bind(done);
}
5105 
5106 
5107 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5108                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5109   switch(to_elem_bt) {
5110     case T_SHORT:
5111       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5112       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5113       vpackusdw(dst, dst, zero, vec_enc);
5114       if (vec_enc == Assembler::AVX_256bit) {
5115         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5116       }
5117       break;
5118     case  T_BYTE:
5119       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5120       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5121       vpackusdw(dst, dst, zero, vec_enc);
5122       if (vec_enc == Assembler::AVX_256bit) {
5123         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5124       }
5125       vpackuswb(dst, dst, zero, vec_enc);
5126       break;
5127     default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt));
5128   }
5129 }
5130 
5131 /*
5132  * Algorithm for vector D2L and F2I conversions (AVX 10.2 unsupported):-
5133  * a) Perform vector D2L/F2I cast.
5134  * b) Choose fast path if none of the result vector lane contains 0x80000000 value.
5135  *    It signifies that source value could be any of the special floating point
5136  *    values(NaN,-Inf,Inf,Max,-Min).
5137  * c) Set destination to zero if source is NaN value.
5138  * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5139  */
5140 
5141 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5142                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5143                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5144   int to_elem_sz = type2aelembytes(to_elem_bt);
5145   assert(to_elem_sz <= 4, "");
5146   vcvttps2dq(dst, src, vec_enc);
5147   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5148   if (to_elem_sz < 4) {
5149     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5150     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5151   }
5152 }
5153 
5154 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5155                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5156                                             Register rscratch, int vec_enc) {
5157   int to_elem_sz = type2aelembytes(to_elem_bt);
5158   assert(to_elem_sz <= 4, "");
5159   vcvttps2dq(dst, src, vec_enc);
5160   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5161   switch(to_elem_bt) {
5162     case T_INT:
5163       break;
5164     case T_SHORT:
5165       evpmovdw(dst, dst, vec_enc);
5166       break;
5167     case T_BYTE:
5168       evpmovdb(dst, dst, vec_enc);
5169       break;
5170     default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt));
5171   }
5172 }
5173 
// Vector float -> long cast: truncating convert followed by the fix-up pass
// that patches NaN/Inf/out-of-range lanes to Java f2l semantics.
void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                            KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
                                            Register rscratch, int vec_enc) {
  evcvttps2qq(dst, src, vec_enc);
  vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
}
5180 
// Handling for downcasting from double to integer or sub-word types on AVX2.
void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
                                           AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz < 8, "");
  // Truncating convert to 32-bit lanes; special-value lanes come back as the
  // sentinel 0x80000000 and are patched below.
  vcvttpd2dq(dst, src, vec_enc);
  vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
                                              float_sign_flip, vec_enc);
  if (to_elem_sz < 4) {
    // xtmp4 holds all zero lanes.
    vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
  }
}
5195 
// Downcast double lanes to long/int/sub-word types on EVEX targets.
// With AVX512DQ a direct truncating D2L convert is used and the 64-bit result
// is narrowed as needed; otherwise the D2I convert path is taken and only
// targets of at most 4 bytes are supported.
void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
                                            XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
                                            KRegister ktmp2, AddressLiteral sign_flip,
                                            Register rscratch, int vec_enc) {
  if (VM_Version::supports_avx512dq()) {
    evcvttpd2qq(dst, src, vec_enc);
    vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    // Narrow the 64-bit intermediate lanes down to the requested target size.
    switch(to_elem_bt) {
      case T_LONG:
        break;
      case T_INT:
        evpmovsqd(dst, dst, vec_enc);
        break;
      case T_SHORT:
        evpmovsqd(dst, dst, vec_enc);
        evpmovdw(dst, dst, vec_enc);
        break;
      case T_BYTE:
        evpmovsqd(dst, dst, vec_enc);
        evpmovdb(dst, dst, vec_enc);
        break;
      default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt));
    }
  } else {
    assert(type2aelembytes(to_elem_bt) <= 4, "");
    vcvttpd2dq(dst, src, vec_enc);
    vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    // Narrow the 32-bit intermediate lanes down to the requested target size.
    switch(to_elem_bt) {
      case T_INT:
        break;
      case T_SHORT:
        evpmovdw(dst, dst, vec_enc);
        break;
      case T_BYTE:
        evpmovdb(dst, dst, vec_enc);
        break;
      default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt));
    }
  }
}
5236 
5237 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5238   switch(to_elem_bt) {
5239     case T_LONG:
5240       evcvttps2qqs(dst, src, vec_enc);
5241       break;
5242     case T_INT:
5243       evcvttps2dqs(dst, src, vec_enc);
5244       break;
5245     case T_SHORT:
5246       evcvttps2dqs(dst, src, vec_enc);
5247       evpmovdw(dst, dst, vec_enc);
5248       break;
5249     case T_BYTE:
5250       evcvttps2dqs(dst, src, vec_enc);
5251       evpmovdb(dst, dst, vec_enc);
5252       break;
5253     default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt));
5254   }
5255 }
5256 
5257 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5258   switch(to_elem_bt) {
5259     case T_LONG:
5260       evcvttps2qqs(dst, src, vec_enc);
5261       break;
5262     case T_INT:
5263       evcvttps2dqs(dst, src, vec_enc);
5264       break;
5265     case T_SHORT:
5266       evcvttps2dqs(dst, src, vec_enc);
5267       evpmovdw(dst, dst, vec_enc);
5268       break;
5269     case T_BYTE:
5270       evcvttps2dqs(dst, src, vec_enc);
5271       evpmovdb(dst, dst, vec_enc);
5272       break;
5273     default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt));
5274   }
5275 }
5276 
5277 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5278   switch(to_elem_bt) {
5279     case T_LONG:
5280       evcvttpd2qqs(dst, src, vec_enc);
5281       break;
5282     case T_INT:
5283       evcvttpd2dqs(dst, src, vec_enc);
5284       break;
5285     case T_SHORT:
5286       evcvttpd2dqs(dst, src, vec_enc);
5287       evpmovdw(dst, dst, vec_enc);
5288       break;
5289     case T_BYTE:
5290       evcvttpd2dqs(dst, src, vec_enc);
5291       evpmovdb(dst, dst, vec_enc);
5292       break;
5293     default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (reg src): %s", type2name(to_elem_bt));
5294   }
5295 }
5296 
5297 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5298   switch(to_elem_bt) {
5299     case T_LONG:
5300       evcvttpd2qqs(dst, src, vec_enc);
5301       break;
5302     case T_INT:
5303       evcvttpd2dqs(dst, src, vec_enc);
5304       break;
5305     case T_SHORT:
5306       evcvttpd2dqs(dst, src, vec_enc);
5307       evpmovdw(dst, dst, vec_enc);
5308       break;
5309     case T_BYTE:
5310       evcvttpd2dqs(dst, src, vec_enc);
5311       evpmovdb(dst, dst, vec_enc);
5312       break;
5313     default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt));
5314   }
5315 }
5316 
5317 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5318                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5319                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5320   // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
5321   // and re-instantiate original MXCSR.RC mode after that.
5322   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5323 
5324   mov64(tmp, julong_cast(0.5L));
5325   evpbroadcastq(xtmp1, tmp, vec_enc);
5326   vaddpd(xtmp1, src , xtmp1, vec_enc);
5327   evcvtpd2qq(dst, xtmp1, vec_enc);
5328   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5329                                                 double_sign_flip, vec_enc);;
5330 
5331   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5332 }
5333 
// Vector Math.round(float) on EVEX targets: floor(val + 0.5) computed under a
// temporarily modified MXCSR rounding mode, with special-value lanes patched
// to Java f2i semantics afterwards.
void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Broadcast 0.5f into every lane and add it to the source before converting.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  vcvtps2dq(dst, xtmp1, vec_enc);
  // Fix up lanes whose source was a special floating point value.
  vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                              float_sign_flip, vec_enc);

  // Restore the standard MXCSR state.
  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5351 
// Vector Math.round(float) on AVX targets: floor(val + 0.5) computed under a
// temporarily modified MXCSR rounding mode, with special-value lanes patched
// to Java f2i semantics afterwards.
void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
                                               AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                               Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Broadcast 0.5f into every lane and add it to the source before converting.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  vcvtps2dq(dst, xtmp1, vec_enc);
  // Fix up lanes whose source was a special floating point value.
  vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);

  // Restore the standard MXCSR state.
  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5368 
// Widen each lane of 'src' from 'from_elem_bt' to the larger integral type
// 'to_elem_bt' using zero extension (vpmovzx* family). Only widening
// conversions are supported; anything else is a programming error.
void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
                                             BasicType from_elem_bt, BasicType to_elem_bt) {
  switch (from_elem_bt) {
    case T_BYTE:
      switch (to_elem_bt) {
        case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
        case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
        case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
        default: ShouldNotReachHere();
      }
      break;
    case T_SHORT:
      switch (to_elem_bt) {
        case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
        case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
        default: ShouldNotReachHere();
      }
      break;
    case T_INT:
      assert(to_elem_bt == T_LONG, "");
      vpmovzxdq(dst, src, vlen_enc);
      break;
    default:
      ShouldNotReachHere();
  }
}
5395 
// Widen each lane of 'src' from 'from_elem_bt' to the larger integral type
// 'to_elem_bt' using sign extension (vpmovsx* family). Mirrors
// vector_unsigned_cast; only widening conversions are supported.
void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
                                           BasicType from_elem_bt, BasicType to_elem_bt) {
  switch (from_elem_bt) {
    case T_BYTE:
      switch (to_elem_bt) {
        case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
        case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
        case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
        default: ShouldNotReachHere();
      }
      break;
    case T_SHORT:
      switch (to_elem_bt) {
        case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
        case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
        default: ShouldNotReachHere();
      }
      break;
    case T_INT:
      assert(to_elem_bt == T_LONG, "");
      vpmovsxdq(dst, src, vlen_enc);
      break;
    default:
      ShouldNotReachHere();
  }
}
5422 
// Re-size the lanes of a vector mask (each lane all-zeros or all-ones) from
// element size 'src_bt' to element size 'dst_bt'. Widening uses sign
// extension, which preserves 0/-1 lane values; narrowing uses signed
// saturating packs (0 stays 0, -1 stays -1). Not used for 512-bit vectors,
// where opmask registers are used instead.
void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
                                         BasicType dst_bt, BasicType src_bt, int vlen) {
  int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
  assert(vlen_enc != AVX_512bit, "");

  int dst_bt_size = type2aelembytes(dst_bt);
  int src_bt_size = type2aelembytes(src_bt);
  if (dst_bt_size > src_bt_size) {
    // Widening: a single sign-extending move suffices.
    switch (dst_bt_size / src_bt_size) {
      case 2: vpmovsxbw(dst, src, vlen_enc); break;
      case 4: vpmovsxbd(dst, src, vlen_enc); break;
      case 8: vpmovsxbq(dst, src, vlen_enc); break;
      default: ShouldNotReachHere();
    }
  } else {
    assert(dst_bt_size < src_bt_size, "");
    switch (src_bt_size / dst_bt_size) {
      case 2: {
        if (vlen_enc == AVX_128bit) {
          vpacksswb(dst, src, src, vlen_enc);
        } else {
          // The packs operate within each 128-bit lane; vpermq with imm 0x08
          // gathers the two valid quadwords into the lower half.
          vpacksswb(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
        }
        break;
      }
      case 4: {
        if (vlen_enc == AVX_128bit) {
          vpackssdw(dst, src, src, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpackssdw(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      case 8: {
        if (vlen_enc == AVX_128bit) {
          // First drop the upper dword of each qword, then pack twice.
          vpshufd(dst, src, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpshufd(dst, src, 0x08, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, AVX_128bit);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      default: ShouldNotReachHere();
    }
  }
}
5477 
// Ternary-logic dispatch on element type: T_INT selects the dword form,
// T_LONG the qword form. 'func' is the 8-bit truth-table immediate of
// vpternlog; 'merge' selects merge- vs zero-masking under 'mask'.
void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
                                   bool merge, BasicType bt, int vlen_enc) {
  if (bt == T_INT) {
    evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
  } else {
    assert(bt == T_LONG, "");
    evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
  }
}
5487 
// Memory-operand variant of the evpternlog dispatch above: identical
// semantics, with the third source taken from memory.
void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
                                   bool merge, BasicType bt, int vlen_enc) {
  if (bt == T_INT) {
    evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
  } else {
    assert(bt == T_LONG, "");
    evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
  }
}
5497 
// Expand a bit-per-lane mask held in GPR 'src' ('mask_len' lanes) into a
// byte-per-lane vector mask in 'dst' (each byte becomes 0x00 or 0x01).
// pdepq with the 0x0101.. pattern deposits 8 mask bits into the LSB of
// 8 consecutive bytes per iteration; 128-bit chunks are then assembled
// into 'dst'. Requires BMI2 (pdep).
void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
                                               Register rtmp2, XMMRegister xtmp, int mask_len,
                                               int vec_enc) {
  int index = 0;
  int vindex = 0;
  // First 8 lanes: deposit the low 8 mask bits, one per byte.
  mov64(rtmp1, 0x0101010101010101L);
  pdepq(rtmp1, src, rtmp1);
  if (mask_len > 8) {
    // Keep a shiftable copy of the mask for the remaining 8-bit chunks.
    movq(rtmp2, src);
    vpxor(xtmp, xtmp, xtmp, vec_enc);
    movq(xtmp, rtmp1);
  }
  movq(dst, rtmp1);

  mask_len -= 8;
  while (mask_len > 0) {
    assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
    index++;
    if ((index % 2) == 0) {
      // Starting a fresh 128-bit chunk: clear the staging register.
      pxor(xtmp, xtmp);
    }
    mov64(rtmp1, 0x0101010101010101L);
    shrq(rtmp2, 8);
    pdepq(rtmp1, rtmp2, rtmp1);
    pinsrq(xtmp, rtmp1, index % 2);
    vindex = index / 2;
    if (vindex) {
      // Write entire 16 byte vector when both 64 bit
      // lanes are updated to save redundant instructions.
      if (index % 2) {
        vinsertf128(dst, dst, xtmp, vindex);
      }
    } else {
      vmovdqu(dst, xtmp);
    }
    mask_len -= 8;
  }
}
5536 
// Reduce a vector mask that has already been materialized as a bitmask in
// GPR 'tmp' (one bit per lane, 'masklen' lanes) to a scalar result in 'dst'.
void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
  switch(opc) {
    case Op_VectorMaskTrueCount:
      // Number of true lanes == population count of the bitmask.
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      if (VM_Version::supports_lzcnt()) {
        // Index of highest set bit = 63 - lzcnt; lzcnt(0) == 64, so an
        // empty mask yields -1.
        lzcntq(tmp, tmp);
        movl(dst, 63);
        subl(dst, tmp);
      } else {
        // bsr leaves flags ZF=1 on zero input; cmov then keeps the -1.
        movl(dst, -1);
        bsrq(tmp, tmp);
        cmov32(Assembler::notZero, dst, tmp);
      }
      break;
    case Op_VectorMaskFirstTrue:
      if (VM_Version::supports_bmi1()) {
        if (masklen < 32) {
          // Plant a sentinel bit just past the mask so an empty mask
          // returns masklen rather than the register width.
          orl(tmp, 1 << masklen);
          tzcntl(dst, tmp);
        } else if (masklen == 32) {
          // tzcnt(0) == 32 already matches the required empty-mask result.
          tzcntl(dst, tmp);
        } else {
          assert(masklen == 64, "");
          tzcntq(dst, tmp);
        }
      } else {
        if (masklen < 32) {
          orl(tmp, 1 << masklen);
          bsfl(dst, tmp);
        } else {
          assert(masklen == 32 || masklen == 64, "");
          // bsf is undefined on zero input; preload masklen and cmov only
          // when a set bit was found.
          movl(dst, masklen);
          if (masklen == 32)  {
            bsfl(tmp, tmp);
          } else {
            bsfq(tmp, tmp);
          }
          cmov32(Assembler::notZero, dst, tmp);
        }
      }
      break;
    case Op_VectorMaskToLong:
      // The bitmask itself is the result; caller must pass dst == tmp.
      assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
      break;
    default: assert(false, "Unhandled mask operation");
  }
}
5586 
// Mask reduction for AVX-512 opmask registers: move the opmask into GPR
// 'tmp', clip stray bits if needed, then reduce via the helper above.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
                                              int masklen, int masksize, int vec_enc) {
  assert(VM_Version::supports_popcnt(), "");

  if(VM_Version::supports_avx512bw()) {
    kmovql(tmp, mask);
  } else {
    // Without AVX512BW, opmasks are at most 16 bits wide.
    assert(masklen <= 16, "");
    kmovwl(tmp, mask);
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped. (FirstTrue is insensitive to garbage in
  // the high bits, so the and is skipped for it.)
  if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5606 
// Mask reduction for masks held in vector registers (AVX/AVX2 targets):
// convert the per-lane mask into a GPR bitmask with the movmsk family,
// clip stray bits if needed, then reduce via vector_mask_operation_helper.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              Register tmp, int masklen, BasicType bt, int vec_enc) {
  assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
         (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
  assert(VM_Version::supports_popcnt(), "");

  bool need_clip = false;
  switch(bt) {
    case T_BOOLEAN:
      // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1.
      // Negate (0 - x) to turn 1 into -1 so the byte's sign bit is set for vpmovmskb.
      vpxor(xtmp, xtmp, xtmp, vec_enc);
      vpsubb(xtmp, xtmp, mask, vec_enc);
      vpmovmskb(tmp, xtmp, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_BYTE:
      vpmovmskb(tmp, mask, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_SHORT:
      // Narrow words to bytes first; the pack works per 128-bit lane, so
      // gather the valid quadwords for the 256-bit case before movmsk.
      vpacksswb(xtmp, mask, mask, vec_enc);
      if (masklen >= 16) {
        vpermpd(xtmp, xtmp, 8, vec_enc);
      }
      vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
      need_clip = masklen < 16;
      break;
    case T_INT:
    case T_FLOAT:
      vmovmskps(tmp, mask, vec_enc);
      need_clip = masklen < 4;
      break;
    case T_LONG:
    case T_DOUBLE:
      vmovmskpd(tmp, mask, vec_enc);
      need_clip = masklen < 2;
      break;
    default: assert(false, "Unhandled type, %s", type2name(bt));
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  if (need_clip && opc != Op_VectorMaskFirstTrue) {
    // need_clip implies masklen < 32
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5656 
// Compress the true lanes of mask 'src' towards lane 0: the result mask has
// popcount(src & ((1 << mask_len) - 1)) contiguous low bits set.
// pext(-1, m) yields exactly (1 << popcount(m)) - 1. Requires BMI2 (pext).
void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
                                             Register rtmp2, int mask_len) {
  kmov(rtmp1, src);
  andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
  mov64(rtmp2, -1L);
  pextq(rtmp2, rtmp2, rtmp1);
  kmov(dst, rtmp2);
}
5665 
// AVX2 (pre-AVX512) implementation of vector compress/expand for 4- and
// 8-byte elements: the per-lane mask is turned into a GPR bitmask, which
// indexes a precomputed 32-byte permute-table row; a vpermps with that row
// compresses/expands the source lanes, and unused lanes are then zeroed.
void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
                                                    XMMRegister mask, Register rtmp, Register rscratch,
                                                    XMMRegister permv, XMMRegister xtmp, BasicType bt,
                                                    int vec_enc) {
  assert(type2aelembytes(bt) >= 4, "");
  assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
  address compress_perm_table = nullptr;
  address expand_perm_table = nullptr;
  if (type2aelembytes(bt) == 8) {
    compress_perm_table = StubRoutines::x86::compress_perm_table64();
    expand_perm_table  = StubRoutines::x86::expand_perm_table64();
    vmovmskpd(rtmp, mask, vec_enc);
  } else {
    compress_perm_table = StubRoutines::x86::compress_perm_table32();
    expand_perm_table = StubRoutines::x86::expand_perm_table32();
    vmovmskps(rtmp, mask, vec_enc);
  }
  shlq(rtmp, 5); // for 32 byte permute row.
  if (opcode == Op_CompressV) {
    lea(rscratch, ExternalAddress(compress_perm_table));
  } else {
    lea(rscratch, ExternalAddress(expand_perm_table));
  }
  addptr(rtmp, rscratch);
  vmovdqu(permv, Address(rtmp));
  vpermps(dst, permv, src, Assembler::AVX_256bit);
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with zero vector using permute mask, each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, this can potentially be used as a blending mask after
  // compressing/expanding the source vector lanes.
  vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
}
5699 
// AVX-512 vector compress/expand: dispatch to the element-typed
// evpcompress*/evpexpand* instruction under opmask 'mask'.
void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
                                               bool merge, BasicType bt, int vec_enc) {
  if (opcode == Op_CompressV) {
    switch(bt) {
    case T_BYTE:
      evpcompressb(dst, mask, src, merge, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      evpcompressw(dst, mask, src, merge, vec_enc);
      break;
    case T_INT:
      evpcompressd(dst, mask, src, merge, vec_enc);
      break;
    case T_FLOAT:
      evcompressps(dst, mask, src, merge, vec_enc);
      break;
    case T_LONG:
      evpcompressq(dst, mask, src, merge, vec_enc);
      break;
    case T_DOUBLE:
      evcompresspd(dst, mask, src, merge, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
    }
  } else {
    assert(opcode == Op_ExpandV, "");
    switch(bt) {
    case T_BYTE:
      evpexpandb(dst, mask, src, merge, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      evpexpandw(dst, mask, src, merge, vec_enc);
      break;
    case T_INT:
      evpexpandd(dst, mask, src, merge, vec_enc);
      break;
    case T_FLOAT:
      evexpandps(dst, mask, src, merge, vec_enc);
      break;
    case T_LONG:
      evpexpandq(dst, mask, src, merge, vec_enc);
      break;
    case T_DOUBLE:
      evexpandpd(dst, mask, src, merge, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
    }
  }
}
5755 
// Vectorized signum for AVX-512: per lane, dst = -1.0 if src < 0,
// +1.0 if src > 0, and src itself when src is NaN, -0.0 or 0.0.
// 'zero' and 'one' hold broadcast 0.0 and 1.0 constants.
void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                           KRegister ktmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    // Start with dst = 0.0 - 1.0 = -1.0 in every lane.
    vsubpd(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
  } else {
    assert(opcode == Op_SignumVF, "");
    vsubps(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmps(dst, ktmp1, dst, src, true, vec_enc);
  }
}
5777 
// Vectorized signum for AVX targets (same contract as vector_signum_evex):
// per lane, dst = -1.0 if src < 0, +1.0 if src > 0, and src itself when
// src is NaN, -0.0 or 0.0. Uses blend instructions instead of opmasks.
void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                          XMMRegister xtmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    // Start with dst = 0.0 - 1.0 = -1.0 in every lane.
    vsubpd(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1 (blend selects on the sign bit of src)
    vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  } else {
    assert(opcode == Op_SignumVF, "");
    vsubps(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  }
}
5797 
// Build an opmask of 'mask_len' lanes from GPR 'src' (presumably holding a
// replicated 0/-1 pattern — the right shift trims the surplus high bits;
// TODO confirm against callers). Width of the kmov depends on AVX512BW.
void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
  if (VM_Version::supports_avx512bw()) {
    if (mask_len > 32) {
      kmovql(dst, src);
    } else {
      kmovdl(dst, src);
      if (mask_len != 32) {
        // Drop the 32 - mask_len excess high bits.
        kshiftrdl(dst, dst, 32 - mask_len);
      }
    }
  } else {
    // Without AVX512BW opmasks are at most 16 bits wide.
    assert(mask_len <= 16, "");
    kmovwl(dst, src);
    if (mask_len != 16) {
      kshiftrwl(dst, dst, 16 - mask_len);
    }
  }
}
5816 
5817 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5818   int lane_size = type2aelembytes(bt);
5819   if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5820       (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5821     movptr(rtmp, imm32);
5822     switch(lane_size) {
5823       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5824       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5825       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5826       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5827       fatal("Unsupported lane size %d", lane_size);
5828       break;
5829     }
5830   } else {
5831     movptr(rtmp, imm32);
5832     movq(dst, rtmp);
5833     switch(lane_size) {
5834       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5835       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5836       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5837       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5838       fatal("Unsupported lane size %d", lane_size);
5839       break;
5840     }
5841   }
5842 }
5843 
5844 //
5845 // Following is lookup table based popcount computation algorithm:-
5846 //       Index   Bit set count
5847 //     [ 0000 ->   0,
5848 //       0001 ->   1,
5849 //       0010 ->   1,
5850 //       0011 ->   2,
5851 //       0100 ->   1,
5852 //       0101 ->   2,
5853 //       0110 ->   2,
5854 //       0111 ->   3,
5855 //       1000 ->   1,
5856 //       1001 ->   2,
5857 //       1010 ->   3,
5858 //       1011 ->   3,
5859 //       1100 ->   2,
5860 //       1101 ->   3,
5861 //       1111 ->   4 ]
5862 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5863 //     shuffle indices for lookup table access.
5864 //  b. Right shift each byte of vector lane by 4 positions.
5865 //  c. Count the number of 1s in 4 MSB bits each byte. These bits are used as
5866 //     shuffle indices for lookup table access.
5867 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5868 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5869 //     count of all the bytes of a quadword.
5870 //  f. Perform step e. for upper 128bit vector lane.
5871 //  g. Pack the bitset count of quadwords back to double word.
5872 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
5873 
// Per-byte popcount via the nibble lookup table described above (steps a-d):
// look up the bit count of the low and high nibble of every byte with
// vpshufb against the popcount LUT and add the two counts.
void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
  vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
  // dst = high nibbles, xtmp1 = low nibbles of each byte.
  vpsrlw(dst, src, 4, vec_enc);
  vpand(dst, dst, xtmp1, vec_enc);
  vpand(xtmp1, src, xtmp1, vec_enc);
  // LUT lookups: each nibble value selects its bit count from xtmp2.
  vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
  vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
  vpshufb(dst, xtmp2, dst, vec_enc);
  vpaddb(dst, dst, xtmp1, vec_enc);
}
5886 
// Per-int popcount: byte-wise popcount, then sum the four byte counts of
// each dword using vpsadbw against zero (steps e-g of the algorithm above).
void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Following code is as per steps e,f,g and h of above algorithm.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
  vpsadbw(dst, dst, xtmp2, vec_enc);
  vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
  vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
  vpackuswb(dst, xtmp1, dst, vec_enc);
}
5898 
// Per-short popcount: byte-wise popcount, then add the counts of the upper
// and lower byte of each word.
void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Add the popcount of upper and lower bytes of word.
  vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
  vpsrlw(dst, xtmp1, 8, vec_enc);
  vpand(xtmp1, xtmp1, xtmp2, vec_enc);
  vpaddw(dst, dst, xtmp1, vec_enc);
}
5908 
// Per-long popcount: byte-wise popcount, then vpsadbw against zero sums the
// eight byte counts of each quadword in one instruction.
void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpsadbw(dst, xtmp1, xtmp2, vec_enc);
}
5915 
// Element-type dispatch for the LUT-based vector popcount (non-VPOPCNTDQ
// path): selects the long/int/short/byte variant above.
void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                 XMMRegister xtmp2, Register rtmp, int vec_enc) {
  switch(bt) {
    case T_LONG:
      vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_INT:
      vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_BYTE:
    case T_BOOLEAN:
      vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
5938 
// Hardware vector popcount for AVX-512 targets: dword/qword forms require
// AVX512_VPOPCNTDQ, word/byte forms require AVX512_BITALG. Masked with
// opmask 'mask' using merge- or zero-masking per 'merge'.
void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                      KRegister mask, bool merge, int vec_enc) {
  assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
  switch(bt) {
    case T_LONG:
      assert(VM_Version::supports_avx512_vpopcntdq(), "");
      evpopcntq(dst, mask, src, merge, vec_enc);
      break;
    case T_INT:
      assert(VM_Version::supports_avx512_vpopcntdq(), "");
      evpopcntd(dst, mask, src, merge, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      assert(VM_Version::supports_avx512_bitalg(), "");
      evpopcntw(dst, mask, src, merge, vec_enc);
      break;
    case T_BYTE:
    case T_BOOLEAN:
      assert(VM_Version::supports_avx512_bitalg(), "");
      evpopcntb(dst, mask, src, merge, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
5966 
5967 // Bit reversal algorithm first reverses the bits of each byte followed by
5968 // a byte level reversal for multi-byte primitive types (short/int/long).
5969 // Algorithm performs a lookup table access to get reverse bit sequence
5970 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte
5971 // is obtained by swapping the reverse bit sequences of upper and lower
5972 // nibble of a byte.
// Reverse the bit order of every element (see the algorithm comment above):
// per-byte bit reversal via the nibble LUT (or shift/mask swaps on plain
// AVX-512), followed by a byte-order reversal for multi-byte element types.
void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, Register rtmp, int vec_enc) {
  if (VM_Version::supports_avx512vlbw()) {

    // Get the reverse bit sequence of lower nibble of each byte.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
    evpandq(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    evporq(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);

  } else if(vec_enc == Assembler::AVX_512bit) {
    // Shift based bit reversal: vpshufb is unavailable without AVX512BW,
    // so swap progressively smaller bit groups with masks and shifts.
    assert(bt == T_LONG || bt == T_INT, "");

    // Swap lower and upper nibble of each byte.
    vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);

    // Swap two least and most significant bits of each nibble.
    vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);

    // Swap adjacent pair of bits.
    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);

    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
  } else {
    // AVX/AVX2 path: same LUT scheme as the AVX512VLBW branch, using the
    // non-EVEX logical instructions.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);

    // Get the reverse bit sequence of lower nibble of each byte.
    vpand(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    vpor(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);
  }
}
6030 
// GFNI-based bit reversal: a single vgf2p8affineqb with the appropriate
// matrix ('mask') reverses the bits of every byte; a byte reversal then
// completes the element-wide reversal for multi-byte types.
void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
                                                XMMRegister xtmp, Register rscratch) {
  assert(VM_Version::supports_gfni(), "");
  assert(rscratch != noreg || always_reachable(mask), "missing");

  // Galois field instruction based bit reversal based on following algorithm.
  // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
  vpbroadcastq(xtmp, mask, vec_enc, rscratch);
  vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
  vector_reverse_byte(bt, dst, xtmp, vec_enc);
}
6042 
// Swap adjacent groups of 'nbits' bits in every lane: bits selected by
// 'bitmask' are shifted left by nbits, the complementary bits are shifted
// right by nbits, and the halves are OR-ed back together.
void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
                                          XMMRegister xtmp1, Register rtmp, int vec_enc) {
  vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
  evpandq(dst, xtmp1, src, vec_enc);
  vpsllq(dst, dst, nbits, vec_enc);
  vpandn(xtmp1, xtmp1, src, vec_enc);
  vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
  evporq(dst, dst, xtmp1, vec_enc);
}
6052 
// Reverse the byte order of every element without vpshufb (for AVX-512
// targets lacking AVX512BW): recursively swap halves with rotates, then
// swap the bytes within each word via vector_swap_nbits.
void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Shift based bit reversal.
  assert(VM_Version::supports_evex(), "");
  switch(bt) {
    case T_LONG:
      // Swap upper and lower double word of each quad word.
      evprorq(xtmp1, k0, src, 32, true, vec_enc);
      evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_INT:
      // Swap upper and lower word of each double word.
      evprord(xtmp1, k0, src, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      // Swap upper and lower byte of each word.
      vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
      break;
    case T_BYTE:
      // Single-byte elements: byte reversal is the identity, just copy.
      evmovdquq(dst, k0, src, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6082 
// Reverse the byte order of every element using vpshufb with a precomputed
// per-type permutation mask. T_BYTE is the identity and reduces to a copy.
void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
  if (bt == T_BYTE) {
    if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
      evmovdquq(dst, k0, src, true, vec_enc);
    } else {
      vmovdqu(dst, src);
    }
    return;
  }
  // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
  // pre-computed shuffle indices.
  switch(bt) {
    case T_LONG:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
      break;
    case T_INT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
      break;
    case T_CHAR:
    case T_SHORT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
  // Shuffle 'src' bytes according to the index mask loaded into 'dst'.
  vpshufb(dst, src, dst, vec_enc);
}
6111 
// Vectorized count-leading-zeros for AVX-512 (requires AVX512CD for
// vplzcnt). Long/int use the hardware instruction directly; short widens
// to dwords first; byte uses a nibble lookup table.
void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                        KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
  assert(is_integral_type(bt), "");
  assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
  assert(VM_Version::supports_avx512cd(), "");
  switch(bt) {
    case T_LONG:
      evplzcntq(dst, ktmp, src, merge, vec_enc);
      break;
    case T_INT:
      evplzcntd(dst, ktmp, src, merge, vec_enc);
      break;
    case T_SHORT:
      // Interleave each short with an all-ones word in the low half; the
      // dword lzcnt of (short << 16 | 0xFFFF) equals the short's own lzcnt
      // (capped at 16), after which the dwords are packed back to words.
      vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
      vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
      evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
      vpunpckhwd(dst, xtmp1, src, vec_enc);
      evplzcntd(dst, ktmp, dst, merge, vec_enc);
      vpackusdw(dst, xtmp2, dst, vec_enc);
      break;
    case T_BYTE:
      // T1 = Compute leading zero counts of 4 LSB bits of each byte by
      // accessing the lookup table.
      // T2 = Compute leading zero counts of 4 MSB bits of each byte by
      // accessing the lookup table.
      // Add T1 to T2 if 4 MSB bits of byte are all zeros.
      assert(VM_Version::supports_avx512bw(), "");
      evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
      vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
      vpand(xtmp2, dst, src, vec_enc);
      vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
      vpsrlw(xtmp3, src, 4, vec_enc);
      vpand(xtmp3, dst, xtmp3, vec_enc);
      vpshufb(dst, xtmp1, xtmp3, vec_enc);
      vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
      evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
      evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6156 
// Per-byte leading-zero count for AVX targets using a 16-entry nibble lookup
// table. On return, xtmp1 holds all zeros (the short variant relies on this).
void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
  vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
  // T1 = Compute leading zero counts of 4 LSB bits of each byte by
  // accessing the lookup table.
  vpand(dst, xtmp2, src, vec_enc);
  vpshufb(dst, xtmp1, dst, vec_enc);
  // T2 = Compute leading zero counts of 4 MSB bits of each byte by
  // accessing the lookup table.
  vpsrlw(xtmp3, src, 4, vec_enc);
  vpand(xtmp3, xtmp2, xtmp3, vec_enc);
  vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
  // Add T1 to T2 if 4 MSB bits of byte are all zeros.
  vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
  vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
  vpaddb(dst, dst, xtmp2, vec_enc);
  // Select T1+T2 where the high nibble is zero, otherwise T2 alone.
  vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
}
6176 
// Leading-zero count for short lanes, built on the per-byte AVX
// implementation: the two per-byte counts inside each word are combined when
// the upper byte is zero, then shifted so the final count lands in the low byte.
void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add zero counts of lower byte and upper byte of a word if
  // upper byte holds a zero value.
  vpsrlw(xtmp3, src, 8, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
  // xtmp2's upper byte becomes clz(upper byte) + clz(lower byte) of each word.
  vpsllw(xtmp2, dst, 8, vec_enc);
  vpaddw(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  // Move the selected per-word count from the upper byte into the low byte.
  vpsrlw(dst, dst, 8, vec_enc);
}
6190 
// Leading-zero count for int lanes without AVX512CD: convert to float and
// derive the count from the biased exponent. Zero and negative inputs are
// patched via blends before the final subtraction from 31.
void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                           XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // By converting the integer to a float, we can obtain the number of leading zeros based on the exponent of the float.
  // As the float exponent contains a bias of 127 for nonzero values, the bias must be removed before interpreting the
  // exponent as the leading zero count.

  // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
  // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
  // contributes to the leading number of zeros.
  vpsrld(dst, src, 1, vec_enc);
  vpandn(dst, dst, src, vec_enc);

  vcvtdq2ps(dst, dst, vec_enc);

  // By comparing the register to itself, all the bits in the destination are set.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);

  // Move the biased exponent to the low end of the lane and mask with 0xFF to discard the sign bit.
  vpsrld(xtmp2, xtmp1, 24, vec_enc);
  vpsrld(dst, dst, 23, vec_enc);
  vpand(dst, xtmp2, dst, vec_enc);

  // Subtract 127 from the exponent, which removes the bias from the exponent.
  vpsrld(xtmp2, xtmp1, 25, vec_enc);
  vpsubd(dst, dst, xtmp2, vec_enc);

  // xtmp2 = 31 in every lane (all-ones shifted right by 27).
  vpsrld(xtmp2, xtmp1, 27, vec_enc);

  // If the original value is 0 the exponent would not have bias, so the subtraction creates a negative number. If this
  // is found in any of the lanes, replace the lane with -1 from xtmp1.
  vblendvps(dst, dst, xtmp1, dst, vec_enc, true, xtmp3);

  // If the original value is negative, replace the lane with 31.
  vblendvps(dst, dst, xtmp2, src, vec_enc, true, xtmp3);

  // Subtract the exponent from 31, giving the final result. For 0, the result is 32 as the exponent was replaced with -1,
  // and for negative numbers the result is 0 as the exponent was replaced with 31.
  vpsubd(dst, xtmp2, dst, vec_enc);
}
6230 
// Leading-zero count for long lanes without AVX512CD, composed from the
// per-dword int implementation above. The two 32-bit half results of each
// long are combined into a single 64-bit count.
// Note: rtmp is not used by this implementation; it is kept for signature
// symmetry with the sibling helpers.
void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  // Find the leading zeros of the top and bottom halves of the long individually.
  vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);

  // Move the top half result to the bottom half of xtmp1, setting the top half to 0.
  vpsrlq(xtmp1, dst, 32, vec_enc);
  // By moving the top half result to the right by 6 bits, if the top half was empty (i.e. 32 is returned) the result bit will
  // be in the most significant position of the bottom half.
  vpsrlq(xtmp2, dst, 6, vec_enc);

  // In the bottom half, add the top half and bottom half results.
  vpaddq(dst, xtmp1, dst, vec_enc);

  // For the bottom half, choose between the values using the most significant bit of xtmp2.
  // If the MSB is set, then bottom+top in dst is the resulting value. If the top half is less than 32 xtmp1 is chosen,
  // which contains only the top half result.
  // In the top half the MSB is always zero, so the value in xtmp1 is always chosen. This value is always 0, which clears
  // the lane as required.
  vblendvps(dst, xtmp1, dst, xtmp2, vec_enc, true, xtmp3);
}
6252 
6253 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6254                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6255                                                        Register rtmp, int vec_enc) {
6256   assert(is_integral_type(bt), "unexpected type");
6257   assert(vec_enc < Assembler::AVX_512bit, "");
6258   switch(bt) {
6259     case T_LONG:
6260       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6261       break;
6262     case T_INT:
6263       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6264       break;
6265     case T_SHORT:
6266       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6267       break;
6268     case T_BYTE:
6269       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6270       break;
6271     default:
6272       fatal("Unsupported type %s", type2name(bt));
6273       break;
6274   }
6275 }
6276 
6277 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6278   switch(bt) {
6279     case T_BYTE:
6280       vpsubb(dst, src1, src2, vec_enc);
6281       break;
6282     case T_SHORT:
6283       vpsubw(dst, src1, src2, vec_enc);
6284       break;
6285     case T_INT:
6286       vpsubd(dst, src1, src2, vec_enc);
6287       break;
6288     case T_LONG:
6289       vpsubq(dst, src1, src2, vec_enc);
6290       break;
6291     default:
6292       fatal("Unsupported type %s", type2name(bt));
6293       break;
6294   }
6295 }
6296 
// Trailing zero count computation is based on leading zero count operation as per
// following equation. All AVX3 targets support AVX512CD feature which offers
// direct vector instruction to compute leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
// Vector trailing-zero count for EVEX targets, derived from leading-zero
// count via CTZ(x) = bit_width - CLZ((x - 1) & ~x).
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp = xtmp + src
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp = xtmp & ~src  (ternlog imm 0x40 selects A & B & ~C with A = B = xtmp4, C = src)
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  // xtmp4 = bit width of the element type, in every lane.
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}
6315 
// Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
// Vector trailing-zero count for AVX2 targets, derived from popcount via
// CTZ(x) = bit_width - POPC(x | -x).
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = 0
  vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
  // xtmp = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp = xtmp | src
  vpor(xtmp3, xtmp3, src, vec_enc);
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  // xtmp1 = bit width of the element type, in every lane.
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}
6331 
// Unsigned 32-bit division: dividend in rax, quotient left in rax.
// rdx is clobbered. When the divisor's sign bit is set (unsigned value
// >= 2^31) the quotient can only be 0 or 1 and is computed branchlessly.
void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common path: zero-extend the dividend into rdx:rax and use hardware divide.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    // andn computes ~(dividend - divisor) & dividend in one instruction.
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  shrl(rax, 31);
  bind(done);
}
6355 
// Unsigned 32-bit remainder: dividend in rax, remainder left in rdx.
// rax is clobbered. When the divisor's sign bit is set the quotient is 0 or 1,
// so the remainder is computed branchlessly from that fact.
void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common path: zero-extend the dividend into rdx:rax; divl leaves the
  // remainder in rdx.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  // Arithmetic shift replicates the "quotient == 1" condition across all bits.
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}
6381 
// Combined unsigned 32-bit division and remainder: dividend in rax,
// quotient left in rax, remainder left in rdx. tmp is clobbered.
void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common path: hardware divide produces quotient in rax and remainder in rdx.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  // tmp keeps a copy so both the logical (quotient) and arithmetic
  // (remainder mask) shifts can be applied.
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}
6412 
// Reverses the bit order of the 32-bit value in src, leaving the result in
// dst. xtmp1/xtmp2 and rtmp are scratch.
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    // The affine transform with this matrix reverses the bits within each
    // byte; the trailing bswapl then reverses the byte order.
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  // Reverse byte order to complete the full 32-bit bit reversal.
  bswapl(dst);
}
6451 
// Reverses the bit order of the 64-bit value in src, leaving the result in
// dst. xtmp1/xtmp2 and rtmp1/rtmp2 are scratch.
void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    // The affine transform with this matrix reverses the bits within each
    // byte; the trailing bswapq then reverses the byte order.
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    // notq turns the mask into its complement, avoiding a second 64-bit
    // immediate load.
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  // Reverse byte order to complete the full 64-bit bit reversal.
  bswapq(dst);
}
6496 
// Unsigned 64-bit division: dividend in rax, quotient left in rax.
// rdx is clobbered. When the divisor's sign bit is set (unsigned value
// >= 2^63) the quotient can only be 0 or 1 and is computed branchlessly.
void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common path: zero-extend the dividend into rdx:rax and use hardware divide.
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    // andn computes ~(dividend - divisor) & dividend in one instruction.
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  shrq(rax, 63);
  bind(done);
}
6520 
// Unsigned 64-bit remainder: dividend in rax, remainder left in rdx.
// rax is clobbered. When the divisor's sign bit is set the quotient is 0 or 1,
// so the remainder is computed branchlessly from that fact.
void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common path: zero-extend the dividend into rdx:rax; divq leaves the
  // remainder in rdx.
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  // Arithmetic shift replicates the "quotient == 1" condition across all bits.
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}
6546 
// Combined unsigned 64-bit division and remainder: dividend in rax,
// quotient left in rax, remainder left in rdx. tmp is clobbered.
void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common path: hardware divide produces quotient in rax and remainder in rdx.
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  // tmp keeps a copy so both the logical (quotient) and arithmetic
  // (remainder mask) shifts can be applied.
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
6576 
// Rearranges the byte lanes of src according to the 0-63 indices held in
// shuffle. vpshufb only shuffles within 128-bit lanes using the low 4 index
// bits, so each source 128-bit lane is broadcast in turn and merged into dst
// under a mask selecting the shuffle indices that address that lane.
void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are inlane operations and indices are determined using
  // lower 4 bit of each shuffle lane, thus all shuffle indices are
  // normalized to index range 0-15. This makes sure that all the multiples
  // of an index value are placed at same relative position in 128 bit
  // lane i.e. elements corresponding to shuffle indices 16, 32 and 64
  // will be 16th element in their respective 128 bit lanes.
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
  // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
  // original shuffle indices and move the shuffled lanes corresponding to true
  // mask to destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
  // and broadcasting second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
  // xtmp2 = 32 in every byte lane (16 << 1).
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
  // and broadcasting third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
  // xtmp1 = 48 in every byte lane (16 + 32).
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
  // xtmp2 = 64 in every byte lane (32 << 1).
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}
6622 
6623 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6624                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6625   if (vlen_enc == AVX_128bit) {
6626     vpermilps(dst, src, shuffle, vlen_enc);
6627   } else if (bt == T_INT) {
6628     vpermd(dst, shuffle, src, vlen_enc);
6629   } else {
6630     assert(bt == T_FLOAT, "");
6631     vpermps(dst, shuffle, src, vlen_enc);
6632   }
6633 }
6634 
6635 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6636   switch(opcode) {
6637     case Op_AddHF: vaddsh(dst, src1, src2); break;
6638     case Op_SubHF: vsubsh(dst, src1, src2); break;
6639     case Op_MulHF: vmulsh(dst, src1, src2); break;
6640     case Op_DivHF: vdivsh(dst, src1, src2); break;
6641     default: assert(false, "%s", NodeClassNames[opcode]); break;
6642   }
6643 }
6644 
6645 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6646   switch(elem_bt) {
6647     case T_BYTE:
6648       if (ideal_opc == Op_SaturatingAddV) {
6649         vpaddsb(dst, src1, src2, vlen_enc);
6650       } else {
6651         assert(ideal_opc == Op_SaturatingSubV, "");
6652         vpsubsb(dst, src1, src2, vlen_enc);
6653       }
6654       break;
6655     case T_SHORT:
6656       if (ideal_opc == Op_SaturatingAddV) {
6657         vpaddsw(dst, src1, src2, vlen_enc);
6658       } else {
6659         assert(ideal_opc == Op_SaturatingSubV, "");
6660         vpsubsw(dst, src1, src2, vlen_enc);
6661       }
6662       break;
6663     default:
6664       fatal("Unsupported type %s", type2name(elem_bt));
6665       break;
6666   }
6667 }
6668 
6669 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6670   switch(elem_bt) {
6671     case T_BYTE:
6672       if (ideal_opc == Op_SaturatingAddV) {
6673         vpaddusb(dst, src1, src2, vlen_enc);
6674       } else {
6675         assert(ideal_opc == Op_SaturatingSubV, "");
6676         vpsubusb(dst, src1, src2, vlen_enc);
6677       }
6678       break;
6679     case T_SHORT:
6680       if (ideal_opc == Op_SaturatingAddV) {
6681         vpaddusw(dst, src1, src2, vlen_enc);
6682       } else {
6683         assert(ideal_opc == Op_SaturatingSubV, "");
6684         vpsubusw(dst, src1, src2, vlen_enc);
6685       }
6686       break;
6687     default:
6688       fatal("Unsupported type %s", type2name(elem_bt));
6689       break;
6690   }
6691 }
6692 
// Saturating unsigned subtraction of int/long lanes using EVEX masking:
// overflowing lanes are clamped to zero (the unsigned lower bound).
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                              XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when magnitude of second input is greater than first input.
  // overflow_mask = Inp1 <u Inp2
  // NOTE(review): the operands are passed swapped, so ktmp appears to hold the
  // complement (src2 <u src1, i.e. the non-overflowing lanes) which the masked
  // subtract below then computes, zeroing the rest -- confirm against
  // evpcmpu()/evmasked_op() semantics.
  evpcmpu(elem_bt, ktmp,  src2, src1, Assembler::lt, vlen_enc);
  // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
  evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
}
6701 
// Saturating unsigned subtraction of int/long lanes for AVX targets (no
// opmask registers): overflowing lanes are clamped to zero via a byte blend.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Emulate unsigned comparison using signed comparison
  // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
  vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
  vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);

  vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);

  // Res = INP1 - INP2 (non-commutative and non-associative)
  vpsub(elem_bt, dst, src1, src2, vlen_enc);
  // Res = Mask ? Zero : Res
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
}
6718 
// Saturating unsigned addition of int/long lanes using EVEX masking:
// overflowing lanes are clamped to the unsigned maximum (all-ones).
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                               XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned values ranges comprise of only +ve numbers, thus there exist only an upper bound saturation.
  // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // T1 = SRC1 | SRC2
  vpor(xtmp1, src1, src2, vlen_enc);
  // Max_Unsigned = -1
  vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
  // Unsigned compare:  Mask = Res <u T1
  evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
  // res  = Mask ? Max_Unsigned : Res
  evpblend(elem_bt, dst, ktmp,  dst, xtmp2, true, vlen_enc);
}
6734 
6735 //
6736 // Section 2-13 Hacker's Delight list following overflow detection check for saturating
6737 // unsigned addition operation.
6738 //    overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
6739 //
6740 // We empirically determined its semantic equivalence to following reduced expression
6741 //    overflow_mask =  (a + b) <u (a | b)
6742 //
6743 // and also verified it though Alive2 solver.
6744 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6745 //
6746 
// Saturating unsigned addition of int/long lanes for AVX targets (no opmask
// registers). The unsigned overflow check (Res <u SRC1|SRC2, see note above)
// is emulated with a signed compare after biasing both sides by MIN_VALUE.
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // Compute T1 = INP1 | INP2
  vpor(xtmp3, src1, src2, vlen_enc);
  // xtmp2 = MIN_VALUE; xtmp1 is left holding all-ones (= unsigned max).
  vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  // Convert T1 to signed value, T1 = T1 + MIN_VALUE
  vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
  // Convert Res to signed value, Res<s> = Res + MIN_VALUE
  vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<1> <s T1
  if (elem_bt == T_INT) {
    vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
  } else {
    assert(elem_bt == T_LONG, "");
    vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
  }
  // Overflowed lanes saturate to the all-ones value held in xtmp1.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
6768 
// Sets ktmp from the sign bit of each qword lane of src. Uses the native
// vpmovq2m when AVX512DQ is available; otherwise broadcasts the sign bit
// across each lane with an arithmetic shift and compares against -1.
// When xtmp2_hold_M1 is true the caller guarantees xtmp2 already holds -1.
void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovq2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      // xtmp2 = -1 in every lane.
      vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    evpsraq(xtmp1, src, 63, vlen_enc);
    evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}
6782 
// Sets ktmp from the sign bit of each dword lane of src. Uses the native
// vpmovd2m when AVX512DQ is available; otherwise broadcasts the sign bit
// across each lane with an arithmetic shift and compares against -1.
// When xtmp2_hold_M1 is true the caller guarantees xtmp2 already holds -1.
void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovd2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      // xtmp2 = -1 in every lane.
      vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    vpsrad(xtmp1, src, 31, vlen_enc);
    Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}
6796 
6797 
6798 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6799   if (elem_bt == T_LONG) {
6800     if (VM_Version::supports_evex()) {
6801       evpsraq(dst, src, 63, vlen_enc);
6802     } else {
6803       vpsrad(dst, src, 31, vlen_enc);
6804       vpshufd(dst, dst, 0xF5, vlen_enc);
6805     }
6806   } else {
6807     assert(elem_bt == T_INT, "");
6808     vpsrad(dst, src, 31, vlen_enc);
6809   }
6810 }
6811 
6812 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6813   if (compute_allones) {
6814     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6815       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6816     } else {
6817       vpcmpeqq(allones, allones, allones, vlen_enc);
6818     }
6819   }
6820   if (elem_bt == T_LONG) {
6821     vpsrlq(dst, allones, 1, vlen_enc);
6822   } else {
6823     assert(elem_bt == T_INT, "");
6824     vpsrld(dst, allones, 1, vlen_enc);
6825   }
6826 }
6827 
6828 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6829   if (compute_allones) {
6830     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6831       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6832     } else {
6833       vpcmpeqq(allones, allones, allones, vlen_enc);
6834     }
6835   }
6836   if (elem_bt == T_LONG) {
6837     vpsllq(dst, allones, 63, vlen_enc);
6838   } else {
6839     assert(elem_bt == T_INT, "");
6840     vpslld(dst, allones, 31, vlen_enc);
6841   }
6842 }
6843 
6844 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
6845                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6846   switch(elem_bt) {
6847     case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6848     case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6849     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6850     case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6851     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6852   }
6853 }
6854 
6855 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6856   switch(elem_bt) {
6857     case  T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
6858     case  T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
6859     case  T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6860     case  T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
6861     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6862   }
6863 }
6864 
6865 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6866                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6867   if (elem_bt == T_LONG) {
6868     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6869   } else {
6870     assert(elem_bt == T_INT, "");
6871     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6872   }
6873 }
6874 
// Saturating add/sub for T_INT/T_LONG lanes on AVX-512 targets using opmask
// registers. x86 provides saturating instructions only for byte/word elements,
// so doubleword/quadword lanes are computed with wrap-around arithmetic and
// overflowing lanes are then patched with the MIN/MAX saturation value.
// Clobbers xtmp1, xtmp2, ktmp1 and ktmp2.
void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                         KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection based on Hacker's Delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Compute overflow detection mask (from the MSB of each lane of xtmp2).
  evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 holds -1 in all its lanes after above call.

  // Compute mask based on first input polarity.
  evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);

  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in first input polarity mask hold a min value.
  evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
  // Blend destination lanes with saturated values using overflow detection mask.
  evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
}
6917 
6918 
// Saturating add/sub for T_INT/T_LONG lanes on AVX (non-EVEX) targets, where
// no opmask registers are available; vector masks and vpblendvb are used
// instead. Wrap-around result lanes that overflowed are replaced with the
// MIN/MAX saturation value. Clobbers xtmp1..xtmp4.
void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                        XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                        XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection based on Hacker's Delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Sign-extend to compute overflow detection mask.
  vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);

  // xtmp1 = all-ones, used to generate the MAX/MIN saturation constants.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose saturating min/max vector using first input polarity mask.
  vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
  vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);

  // Blend result with saturating vector using overflow detection mask.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
6959 
6960 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6961   switch(elem_bt) {
6962     case T_BYTE:
6963       if (ideal_opc == Op_SaturatingAddV) {
6964         vpaddsb(dst, src1, src2, vlen_enc);
6965       } else {
6966         assert(ideal_opc == Op_SaturatingSubV, "");
6967         vpsubsb(dst, src1, src2, vlen_enc);
6968       }
6969       break;
6970     case T_SHORT:
6971       if (ideal_opc == Op_SaturatingAddV) {
6972         vpaddsw(dst, src1, src2, vlen_enc);
6973       } else {
6974         assert(ideal_opc == Op_SaturatingSubV, "");
6975         vpsubsw(dst, src1, src2, vlen_enc);
6976       }
6977       break;
6978     default:
6979       fatal("Unsupported type %s", type2name(elem_bt));
6980       break;
6981   }
6982 }
6983 
6984 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6985   switch(elem_bt) {
6986     case T_BYTE:
6987       if (ideal_opc == Op_SaturatingAddV) {
6988         vpaddusb(dst, src1, src2, vlen_enc);
6989       } else {
6990         assert(ideal_opc == Op_SaturatingSubV, "");
6991         vpsubusb(dst, src1, src2, vlen_enc);
6992       }
6993       break;
6994     case T_SHORT:
6995       if (ideal_opc == Op_SaturatingAddV) {
6996         vpaddusw(dst, src1, src2, vlen_enc);
6997       } else {
6998         assert(ideal_opc == Op_SaturatingSubV, "");
6999         vpsubusw(dst, src1, src2, vlen_enc);
7000       }
7001       break;
7002     default:
7003       fatal("Unsupported type %s", type2name(elem_bt));
7004       break;
7005   }
7006 }
7007 
7008 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
7009                                                      XMMRegister src2, int vlen_enc) {
7010   switch(elem_bt) {
7011     case T_BYTE:
7012       evpermi2b(dst, src1, src2, vlen_enc);
7013       break;
7014     case T_SHORT:
7015       evpermi2w(dst, src1, src2, vlen_enc);
7016       break;
7017     case T_INT:
7018       evpermi2d(dst, src1, src2, vlen_enc);
7019       break;
7020     case T_LONG:
7021       evpermi2q(dst, src1, src2, vlen_enc);
7022       break;
7023     case T_FLOAT:
7024       evpermi2ps(dst, src1, src2, vlen_enc);
7025       break;
7026     case T_DOUBLE:
7027       evpermi2pd(dst, src1, src2, vlen_enc);
7028       break;
7029     default:
7030       fatal("Unsupported type %s", type2name(elem_bt));
7031       break;
7032   }
7033 }
7034 
7035 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
7036   if (is_unsigned) {
7037     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7038   } else {
7039     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7040   }
7041 }
7042 
7043 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
7044   if (is_unsigned) {
7045     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7046   } else {
7047     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7048   }
7049 }
7050 
7051 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
7052   switch(opcode) {
7053     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7054     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7055     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7056     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7057     default: assert(false, "%s", NodeClassNames[opcode]); break;
7058   }
7059 }
7060 
7061 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7062   switch(opcode) {
7063     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7064     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7065     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7066     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7067     default: assert(false, "%s", NodeClassNames[opcode]); break;
7068   }
7069 }
7070 
7071 void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7072                                             KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
7073   vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
7074 }
7075 
// Vector FP16 max/min. vmaxph/vminph alone do not deliver the desired
// semantics: when both inputs are 0.0 (of either sign), or when exactly one
// input is a NaN, the instruction returns its second source operand. The
// inputs are therefore pre-swapped (using sign and NaN masks) so that the
// instruction's bias produces the correct result. Clobbers ktmp, xtmp1, xtmp2.
void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
    // Move sign bits of src2 to mask register.
    evpmovw2m(ktmp, src2, vlen_enc);
    // xtmp1 = src2 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make the second source operand a +ve value.
    // As per instruction semantic, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned
    // dst = max(xtmp1, xtmp2)
    evmaxph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if it's a NaN value,
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  } else {
    assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
    // Move sign bits of src1 to mask register.
    evpmovw2m(ktmp, src1, vlen_enc);
    // xtmp1 = src1 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src1 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make the second source operand a -ve value.
    // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned.
    // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
    // or a valid floating-point value, is written to the result.
    // dst = min(xtmp1, xtmp2)
    evminph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if it's a NaN value,
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  }
}