1 /*
   2  * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "gc/shared/barrierSet.hpp"
  28 #include "gc/shared/barrierSetAssembler.hpp"
  29 #include "oops/methodData.hpp"
  30 #include "opto/c2_MacroAssembler.hpp"
  31 #include "opto/intrinsicnode.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/opcodes.hpp"
  34 #include "opto/subnode.hpp"
  35 #include "runtime/globals.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/objectMonitorTable.hpp"
  38 #include "runtime/stubRoutines.hpp"
  39 #include "runtime/synchronizer.hpp"
  40 #include "utilities/checkedCast.hpp"
  41 #include "utilities/globalDefinitions.hpp"
  42 #include "utilities/powerOfTwo.hpp"
  43 #include "utilities/sizes.hpp"
  44 
  45 #ifdef PRODUCT
  46 #define BLOCK_COMMENT(str) /* nothing */
  47 #define STOP(error) stop(error)
  48 #else
  49 #define BLOCK_COMMENT(str) block_comment(str)
  50 #define STOP(error) block_comment(error); stop(error)
  51 #endif
  52 
// C2 compiled method's prolog code.
// Emits the stack-overflow bang (when requested), saves rbp, allocates the
// fixed-size frame and plants the nmethod entry barrier.
// NOTE(review): fp_mode_24b is not referenced in this body - presumably kept
// for signature compatibility across ports; confirm.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // No bang requested: allocate the whole frame at once and store rbp into
    // its frame slot instead of pushing it separately.
    subptr(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    // Verify alignment: with rax temporarily pushed, rsp modulo
    // StackAlignmentInBytes must equal StackAlignmentInBytes - wordSize.
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
    // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
    Label dummy_slow_path;
    Label dummy_continuation;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
    }
    bs->nmethod_entry_barrier(this, slow_path, continuation);
  }
}
 135 
 136 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 137   switch (vlen_in_bytes) {
 138     case  4: // fall-through
 139     case  8: // fall-through
 140     case 16: return Assembler::AVX_128bit;
 141     case 32: return Assembler::AVX_256bit;
 142     case 64: return Assembler::AVX_512bit;
 143 
 144     default: {
 145       ShouldNotReachHere();
 146       return Assembler::AVX_NoVec;
 147     }
 148   }
 149 }
 150 
 151 // fast_lock and fast_unlock used by C2
 152 
 153 // Because the transitions from emitted code to the runtime
 154 // monitorenter/exit helper stubs are so slow it's critical that
 155 // we inline both the lock-stack fast path and the inflated fast path.
 156 //
 157 // See also: cmpFastLock and cmpFastUnlock.
 158 //
 159 // What follows is a specialized inline transliteration of the code
 160 // in enter() and exit(). If we're concerned about I$ bloat another
 161 // option would be to emit TrySlowEnter and TrySlowExit methods
 162 // at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 164 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 165 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 166 // In practice, however, the # of lock sites is bounded and is usually small.
 167 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
 168 // if the processor uses simple bimodal branch predictors keyed by EIP
 169 // Since the helper routines would be called from multiple synchronization
 170 // sites.
 171 //
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 173 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 174 // to those specialized methods.  That'd give us a mostly platform-independent
 175 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
 177 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 178 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 179 // (b) explicit barriers or fence operations.
 180 //
 181 // TODO:
 182 //
 183 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 184 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 185 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 186 //    the lock operators would typically be faster than reifying Self.
 187 //
 188 // *  Ideally I'd define the primitives as:
 189 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 190 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 191 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
 193 //    Furthermore the register assignments are overconstrained, possibly resulting in
 194 //    sub-optimal code near the synchronization site.
 195 //
 196 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 197 //    Alternately, use a better sp-proximity test.
 198 //
 199 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 200 //    Either one is sufficient to uniquely identify a thread.
 201 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 202 //
 203 // *  Intrinsify notify() and notifyAll() for the common cases where the
 204 //    object is locked by the calling thread but the waitlist is empty.
 205 //    avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
 206 //
 207 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 208 //    But beware of excessive branch density on AMD Opterons.
 209 //
 210 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 211 //    or failure of the fast path.  If the fast path fails then we pass
 212 //    control to the slow path, typically in C.  In fast_lock and
 213 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 214 //    will emit a conditional branch immediately after the node.
 215 //    So we have branches to branches and lots of ICC.ZF games.
 216 //    Instead, it might be better to have C2 pass a "FailureLabel"
 217 //    into fast_lock and fast_unlock.  In the case of success, control
 218 //    will drop through the node.  ICC.ZF is undefined at exit.
 219 //    In the case of failure, the node will branch directly to the
 220 //    FailureLabel
 221 
// Emit the C2 fast-path monitor-enter sequence. On exit ZF = 1 means the lock
// was acquired; ZF = 0 means the caller must take the slow path (runtime call).
//
// obj:    object to lock
// box:    on-stack box address -- KILLED
// rax:    tmp -- KILLED (doubles as the CAS accumulator, hence the assert)
// t  :    tmp -- KILLED
// thread: current JavaThread
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register rax_reg,
                                  Register t, Register thread) {
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Synchronizing on a value-based class is diagnosed in the runtime.
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Fast Lock

    Label push;

    // When the monitor table is in use, box must stay live (it holds the
    // monitor cache slot), so top shares rax instead.
    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    // rax = expected (unlocked) mark, mark = desired (locked) mark.
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmp(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      // Without the table the mark word (tagged) is the monitor pointer.
      assert(mark == monitor, "should be the same here");
    } else {
      const Register hash = t;
      Label monitor_found;

      // Look for the monitor in the om_cache.

      // Unrolled linear scan over the per-thread cache of (oop, monitor) pairs.
      ByteSize cache_offset   = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled  = OMCache::CAPACITY;
      for (int i = 0; i < num_unrolled; i++) {
        movptr(monitor, Address(thread,  cache_offset + monitor_offset));
        cmpptr(obj, Address(thread, cache_offset));
        jccb(Assembler::equal, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      // Look for the monitor in the table.

      // Get the hash code.
      movptr(hash, Address(obj, oopDesc::mark_offset_in_bytes()));
      shrq(hash, markWord::hash_shift);
      andq(hash, markWord::hash_mask);

      // Get the table and calculate the bucket's address.
      lea(rax_reg, ExternalAddress(ObjectMonitorTable::current_table_address()));
      movptr(rax_reg, Address(rax_reg));
      andq(hash, Address(rax_reg, ObjectMonitorTable::table_capacity_mask_offset()));
      movptr(rax_reg, Address(rax_reg, ObjectMonitorTable::table_buckets_offset()));

      // Read the monitor from the bucket.
      movptr(monitor, Address(rax_reg, hash, Address::times_ptr));

      // Check if the monitor in the bucket is special (empty, tombstone or removed)
      cmpptr(monitor, ObjectMonitorTable::SpecialPointerValues::below_is_special);
      jcc(Assembler::below, slow_path);

      // Check if object matches.
      movptr(rax_reg, Address(monitor, ObjectMonitor::object_offset()));
      BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
      bs_asm->try_resolve_weak_handle_in_c2(this, rax_reg, slow_path);
      cmpptr(rax_reg, obj);
      jcc(Assembler::notEqual, slow_path);

      bind(monitor_found);
    }
    // Monitor fields are addressed off the (possibly tagged) monitor pointer.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock before trashing box. On failure to acquire
      // the lock, the slow path will reset the entry accordingly (see CacheSetter).
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
    lock(); cmpxchgptr(box, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive.
    // On CAS failure rax_reg holds the observed owner; equal to our id means
    // we already own the monitor.
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
 397 
 398 // obj: object to lock
 399 // rax: tmp -- KILLED
 400 // t  : tmp - cannot be obj nor rax -- KILLED
 401 //
 402 // Some commentary on balanced locking:
 403 //
 404 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 405 // Methods that don't have provably balanced locking are forced to run in the
 406 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 407 // The interpreter provides two properties:
 408 // I1:  At return-time the interpreter automatically and quietly unlocks any
 409 //      objects acquired in the current activation (frame).  Recall that the
 410 //      interpreter maintains an on-stack list of locks currently held by
 411 //      a frame.
 412 // I2:  If a method attempts to unlock an object that is not held by the
 413 //      frame the interpreter throws IMSX.
 414 //
 415 // Lets say A(), which has provably balanced locking, acquires O and then calls B().
 416 // B() doesn't have provably balanced locking so it runs in the interpreter.
 417 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 418 // is still locked by A().
 419 //
 420 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface
 421 // Specification" states that an object locked by JNI's MonitorEnter should not be
 422 // unlocked by "normal" java-level locking and vice-versa.  The specification doesn't
 423 // specify what will occur if a program engages in such mixed-mode locking, however.
 424 // Arguably given that the spec legislates the JNI case as undefined our implementation
 425 // could reasonably *avoid* checking owner in fast_unlock().
 426 // In the interest of performance we elide m->Owner==Self check in unlock.
 427 // A perfectly viable alternative is to elide the owner check except when
 428 // Xcheck:jni is enabled.
 429 
// Emit the C2 fast-path monitor-exit sequence. On exit ZF = 1 means the lock
// was released; ZF = 0 means the caller must take the slow path.
//
// obj:    object to unlock
// rax:    tmp -- KILLED (CAS accumulator)
// t  :    tmp -- KILLED
// thread: current JavaThread
void C2_MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register t, Register thread) {
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully.  MUST jump with ZF == 1
  Label unlocked, slow_path;

  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockStub* stub = nullptr;

  // When only measuring code size, emit against a dummy label instead of a
  // real out-of-line stub.
  if (!Compile::current()->output()->in_scratch_emit_size()) {
    stub = new (Compile::current()->comp_arena()) C2FastUnlockStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Fast Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    // A second occurrence of obj below the popped slot means a recursive
    // fast-lock: nothing more to do.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    // reg_rax = expected (locked) mark, mark = desired (unlocked) mark.
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    // CAS failure: re-push obj onto the lock-stack and go slow (stub code).
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    // Debug-only scan: obj must not appear anywhere on the lock-stack when
    // its lock is inflated.
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jcc(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      // Without the table the mark word (tagged) is the monitor pointer.
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    // Monitor fields are addressed off the (possibly tagged) monitor pointer.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jcc(Assembler::notZero, recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    cmpptr(entry_list_address, NULL_WORD);
    jcc(Assembler::zero, unlocked);    // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jcc(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      // Strip the mark-word tag to get the raw monitor pointer.
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
  }

  bind(unlocked);
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Unlock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
 591 
// Out-of-line failure handler for verify_int_in_range(): called from compiled
// code when a CastII value escapes its declared [lo, hi] range. Never returns.
static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}
 595 
 596 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
 597   const int framesize = Compile::current()->output()->frame_size_in_bytes();
 598   masm->movptr(dst, rsp);
 599   if (framesize > 2 * wordSize) {
 600     masm->addptr(dst, framesize - 2 * wordSize);
 601   }
 602 }
 603 
 604 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
 605   if (PreserveFramePointer) {
 606     // frame pointer is valid
 607 #ifdef ASSERT
 608     // Verify frame pointer value in rbp.
 609     reconstruct_frame_pointer_helper(this, rtmp);
 610     Label L_success;
 611     cmpq(rbp, rtmp);
 612     jccb(Assembler::equal, L_success);
 613     STOP("frame pointer mismatch");
 614     bind(L_success);
 615 #endif // ASSERT
 616   } else {
 617     reconstruct_frame_pointer_helper(this, rbp);
 618   }
 619 }
 620 
 621 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
 622   jint lo = t->_lo;
 623   jint hi = t->_hi;
 624   assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
 625   if (t == TypeInt::INT) {
 626     return;
 627   }
 628 
 629   BLOCK_COMMENT("CastII {");
 630   Label fail;
 631   Label succeed;
 632 
 633   if (lo != min_jint) {
 634     cmpl(val, lo);
 635     jccb(Assembler::less, fail);
 636   }
 637   if (hi != max_jint) {
 638     cmpl(val, hi);
 639     jccb(Assembler::greater, fail);
 640   }
 641   jmpb(succeed);
 642 
 643   bind(fail);
 644   movl(c_rarg0, idx);
 645   movl(c_rarg1, val);
 646   movl(c_rarg2, lo);
 647   movl(c_rarg3, hi);
 648   reconstruct_frame_pointer(rscratch1);
 649   call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
 650   hlt();
 651   bind(succeed);
 652   BLOCK_COMMENT("} // CastII");
 653 }
 654 
// Out-of-line failure handler for verify_long_in_range(): called from compiled
// code when a CastLL value escapes its declared [lo, hi] range. Never returns.
static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}
 658 
 659 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
 660   jlong lo = t->_lo;
 661   jlong hi = t->_hi;
 662   assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
 663   if (t == TypeLong::LONG) {
 664     return;
 665   }
 666 
 667   BLOCK_COMMENT("CastLL {");
 668   Label fail;
 669   Label succeed;
 670 
 671   auto cmp_val = [&](jlong bound) {
 672     if (is_simm32(bound)) {
 673       cmpq(val, checked_cast<int>(bound));
 674     } else {
 675       mov64(tmp, bound);
 676       cmpq(val, tmp);
 677     }
 678   };
 679 
 680   if (lo != min_jlong) {
 681     cmp_val(lo);
 682     jccb(Assembler::less, fail);
 683   }
 684   if (hi != max_jlong) {
 685     cmp_val(hi);
 686     jccb(Assembler::greater, fail);
 687   }
 688   jmpb(succeed);
 689 
 690   bind(fail);
 691   movl(c_rarg0, idx);
 692   movq(c_rarg1, val);
 693   mov64(c_rarg2, lo);
 694   mov64(c_rarg3, hi);
 695   reconstruct_frame_pointer(rscratch1);
 696   call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
 697   hlt();
 698   bind(succeed);
 699   BLOCK_COMMENT("} // CastLL");
 700 }
 701 
 702 //-------------------------------------------------------------------------------------------
 703 // Generic instructions support for use in .ad files C2 code generation
 704 
 705 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 706   if (dst != src) {
 707     movdqu(dst, src);
 708   }
 709   if (opcode == Op_AbsVD) {
 710     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 711   } else {
 712     assert((opcode == Op_NegVD),"opcode should be Op_NegD");
 713     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 714   }
 715 }
 716 
 717 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 718   if (opcode == Op_AbsVD) {
 719     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 720   } else {
 721     assert((opcode == Op_NegVD),"opcode should be Op_NegD");
 722     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 723   }
 724 }
 725 
 726 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 727   if (dst != src) {
 728     movdqu(dst, src);
 729   }
 730   if (opcode == Op_AbsVF) {
 731     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 732   } else {
 733     assert((opcode == Op_NegVF),"opcode should be Op_NegF");
 734     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 735   }
 736 }
 737 
 738 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 739   if (opcode == Op_AbsVF) {
 740     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 741   } else {
 742     assert((opcode == Op_NegVF),"opcode should be Op_NegF");
 743     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 744   }
 745 }
 746 
 747 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 748   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 749   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 750 
 751   if (opcode == Op_MinV) {
 752     if (elem_bt == T_BYTE) {
 753       pminsb(dst, src);
 754     } else if (elem_bt == T_SHORT) {
 755       pminsw(dst, src);
 756     } else if (elem_bt == T_INT) {
 757       pminsd(dst, src);
 758     } else {
 759       assert(elem_bt == T_LONG, "required");
 760       assert(tmp == xmm0, "required");
 761       assert_different_registers(dst, src, tmp);
 762       movdqu(xmm0, dst);
 763       pcmpgtq(xmm0, src);
 764       blendvpd(dst, src);  // xmm0 as mask
 765     }
 766   } else { // opcode == Op_MaxV
 767     if (elem_bt == T_BYTE) {
 768       pmaxsb(dst, src);
 769     } else if (elem_bt == T_SHORT) {
 770       pmaxsw(dst, src);
 771     } else if (elem_bt == T_INT) {
 772       pmaxsd(dst, src);
 773     } else {
 774       assert(elem_bt == T_LONG, "required");
 775       assert(tmp == xmm0, "required");
 776       assert_different_registers(dst, src, tmp);
 777       movdqu(xmm0, src);
 778       pcmpgtq(xmm0, dst);
 779       blendvpd(dst, src);  // xmm0 as mask
 780     }
 781   }
 782 }
 783 
 784 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 785                                   XMMRegister src1, Address src2, int vlen_enc) {
 786   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 787   if (opcode == Op_UMinV) {
 788     switch(elem_bt) {
 789       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 790       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 791       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 792       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 793       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 794     }
 795   } else {
 796     assert(opcode == Op_UMaxV, "required");
 797     switch(elem_bt) {
 798       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 799       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 800       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 801       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 802       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 803     }
 804   }
 805 }
 806 
void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Unsigned 64-bit element min/max: dst = uminmax(src1, src2).
  // xtmp1/xtmp2 are clobbered on the emulation path below.
  // For optimality, leverage a full vector width of 512 bits
  // for operations over smaller vector sizes on AVX512 targets.
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
    // No unsigned 64-bit compare available here: bias both operands by 2^63
    // (add the sign bit) so a signed compare yields the unsigned ordering.
    // T1 = -1
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
    // Mask = T2 > T1  (i.e. src2 >u src1)
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}
 837 
 838 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 839                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
 840   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 841   if (opcode == Op_UMinV) {
 842     switch(elem_bt) {
 843       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 844       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 845       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 846       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 847       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 848     }
 849   } else {
 850     assert(opcode == Op_UMaxV, "required");
 851     switch(elem_bt) {
 852       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 853       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 854       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 855       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 856       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 857     }
 858   }
 859 }
 860 
 861 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 862                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 863                                  int vlen_enc) {
 864   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 865 
 866   if (opcode == Op_MinV) {
 867     if (elem_bt == T_BYTE) {
 868       vpminsb(dst, src1, src2, vlen_enc);
 869     } else if (elem_bt == T_SHORT) {
 870       vpminsw(dst, src1, src2, vlen_enc);
 871     } else if (elem_bt == T_INT) {
 872       vpminsd(dst, src1, src2, vlen_enc);
 873     } else {
 874       assert(elem_bt == T_LONG, "required");
 875       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 876         vpminsq(dst, src1, src2, vlen_enc);
 877       } else {
 878         assert_different_registers(dst, src1, src2);
 879         vpcmpgtq(dst, src1, src2, vlen_enc);
 880         vblendvpd(dst, src1, src2, dst, vlen_enc);
 881       }
 882     }
 883   } else { // opcode == Op_MaxV
 884     if (elem_bt == T_BYTE) {
 885       vpmaxsb(dst, src1, src2, vlen_enc);
 886     } else if (elem_bt == T_SHORT) {
 887       vpmaxsw(dst, src1, src2, vlen_enc);
 888     } else if (elem_bt == T_INT) {
 889       vpmaxsd(dst, src1, src2, vlen_enc);
 890     } else {
 891       assert(elem_bt == T_LONG, "required");
 892       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 893         vpmaxsq(dst, src1, src2, vlen_enc);
 894       } else {
 895         assert_different_registers(dst, src1, src2);
 896         vpcmpgtq(dst, src1, src2, vlen_enc);
 897         vblendvpd(dst, src2, src1, dst, vlen_enc);
 898       }
 899     }
 900   }
 901 }
 902 
 903 // Float/Double min max
 904 
void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   *  Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  // Select the float vs. double flavor of blend/minmax/compare, plus the
  // operand whose sign drives the pre-blend: a for min, b for max.
  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  // With ECore opts the sign mask is materialized once up front (replicating
  // each lane's sign bit through the lane) so the blends below can take a
  // plain vector mask rather than deriving the sign per blend.
  bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
  if (precompute_mask && !is_double_word) {
    // Arithmetic right shift by >= lane width fills every bit with the sign bit.
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    // No 64-bit vpsraq pre-AVX512: (0 > mask) gives all-ones for negative lanes.
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);  // atmp per pseudo code above
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);   // btmp per pseudo code above
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);                       // Tmp = min/max(atmp, btmp)
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);     // scratch = lanes where atmp is NaN
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch); // Res = NaN ? atmp : Tmp
}
 992 
void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  // AVX512 float/double min/max with Java semantics:
  //  - -0.0 must order below +0.0 (hardware vmin/vmax treats them as equal);
  //  - NaN in either input must propagate.
  // Strategy (mirrors vminmax_fp above, using k-mask blends): derive a sign
  // mask from one operand (a for min, b for max), blend the inputs so the
  // hardware min/max resolves signed zeros correctly, then merge back any
  // lanes where atmp was NaN.
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    // ktmp = per-lane sign bits of a.
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    // NaN fixup: where atmp is unordered with itself, take atmp.
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    // ktmp = per-lane sign bits of b.
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}
1039 
1040 void C2_MacroAssembler::vminmax_fp_avx10_2(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1041                                            XMMRegister src1, XMMRegister src2, int vlen_enc) {
1042   assert(opc == Op_MinV || opc == Op_MinReductionV ||
1043          opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");
1044 
1045   int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
1046                                                          : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
1047   if (elem_bt == T_FLOAT) {
1048     evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
1049   } else {
1050     assert(elem_bt == T_DOUBLE, "");
1051     evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
1052   }
1053 }
1054 
1055 void C2_MacroAssembler::sminmax_fp_avx10_2(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1056                                            XMMRegister src1, XMMRegister src2) {
1057   assert(opc == Op_MinF || opc == Op_MaxF ||
1058          opc == Op_MinD || opc == Op_MaxD, "sanity");
1059 
1060   int imm8 = (opc == Op_MinF || opc == Op_MinD) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
1061                                                 : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
1062   if (elem_bt == T_FLOAT) {
1063     evminmaxss(dst, mask, src1, src2, true, imm8);
1064   } else {
1065     assert(elem_bt == T_DOUBLE, "");
1066     evminmaxsd(dst, mask, src1, src2, true, imm8);
1067   }
1068 }
1069 
// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  // Math.signum: return the argument unchanged for +0.0/-0.0/NaN,
  // +1.0 for positive inputs, -1.0 for negative inputs.
  // dst holds the input on entry and the result on exit; zero/one hold the
  // constants 0.0 and 1.0.
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  // Handle special cases +0.0/-0.0 and NaN, if argument is +0.0/-0.0 or NaN, return argument
  // If AVX10.2 (or newer) floating point comparison instructions used, SF=1 for equal and unordered cases
  // If other floating point comparison instructions used, ZF=1 for equal and unordered cases
  if (opcode == Op_SignumF) {
    if (VM_Version::supports_avx10_2()) {
      evucomxss(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomiss(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    // Load +1.0. movflt does not modify RFLAGS, so the jcc below still
    // observes the flags produced by the comparison against zero above.
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    // Input was negative: flip the sign bit of +1.0 to produce -1.0.
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    if (VM_Version::supports_avx10_2()) {
      evucomxsd(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomisd(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    // Load +1.0; movdbl likewise leaves the comparison flags intact.
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    // Negative input: produce -1.0 by flipping the sign bit.
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}
1105 
1106 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1107   if (sign) {
1108     pmovsxbw(dst, src);
1109   } else {
1110     pmovzxbw(dst, src);
1111   }
1112 }
1113 
1114 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1115   if (sign) {
1116     vpmovsxbw(dst, src, vector_len);
1117   } else {
1118     vpmovzxbw(dst, src, vector_len);
1119   }
1120 }
1121 
1122 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1123   if (sign) {
1124     vpmovsxbd(dst, src, vector_len);
1125   } else {
1126     vpmovzxbd(dst, src, vector_len);
1127   }
1128 }
1129 
1130 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1131   if (sign) {
1132     vpmovsxwd(dst, src, vector_len);
1133   } else {
1134     vpmovzxwd(dst, src, vector_len);
1135   }
1136 }
1137 
1138 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1139                                      int shift, int vector_len) {
1140   if (opcode == Op_RotateLeftV) {
1141     if (etype == T_INT) {
1142       evprold(dst, src, shift, vector_len);
1143     } else {
1144       assert(etype == T_LONG, "expected type T_LONG");
1145       evprolq(dst, src, shift, vector_len);
1146     }
1147   } else {
1148     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1149     if (etype == T_INT) {
1150       evprord(dst, src, shift, vector_len);
1151     } else {
1152       assert(etype == T_LONG, "expected type T_LONG");
1153       evprorq(dst, src, shift, vector_len);
1154     }
1155   }
1156 }
1157 
1158 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1159                                      XMMRegister shift, int vector_len) {
1160   if (opcode == Op_RotateLeftV) {
1161     if (etype == T_INT) {
1162       evprolvd(dst, src, shift, vector_len);
1163     } else {
1164       assert(etype == T_LONG, "expected type T_LONG");
1165       evprolvq(dst, src, shift, vector_len);
1166     }
1167   } else {
1168     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1169     if (etype == T_INT) {
1170       evprorvd(dst, src, shift, vector_len);
1171     } else {
1172       assert(etype == T_LONG, "expected type T_LONG");
1173       evprorvq(dst, src, shift, vector_len);
1174     }
1175   }
1176 }
1177 
1178 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1179   if (opcode == Op_RShiftVI) {
1180     psrad(dst, shift);
1181   } else if (opcode == Op_LShiftVI) {
1182     pslld(dst, shift);
1183   } else {
1184     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1185     psrld(dst, shift);
1186   }
1187 }
1188 
1189 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1190   switch (opcode) {
1191     case Op_RShiftVI:  psrad(dst, shift); break;
1192     case Op_LShiftVI:  pslld(dst, shift); break;
1193     case Op_URShiftVI: psrld(dst, shift); break;
1194 
1195     default: assert(false, "%s", NodeClassNames[opcode]);
1196   }
1197 }
1198 
1199 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1200   if (opcode == Op_RShiftVI) {
1201     vpsrad(dst, nds, shift, vector_len);
1202   } else if (opcode == Op_LShiftVI) {
1203     vpslld(dst, nds, shift, vector_len);
1204   } else {
1205     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1206     vpsrld(dst, nds, shift, vector_len);
1207   }
1208 }
1209 
1210 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1211   switch (opcode) {
1212     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1213     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1214     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1215 
1216     default: assert(false, "%s", NodeClassNames[opcode]);
1217   }
1218 }
1219 
1220 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1221   switch (opcode) {
1222     case Op_RShiftVB:  // fall-through
1223     case Op_RShiftVS:  psraw(dst, shift); break;
1224 
1225     case Op_LShiftVB:  // fall-through
1226     case Op_LShiftVS:  psllw(dst, shift);   break;
1227 
1228     case Op_URShiftVS: // fall-through
1229     case Op_URShiftVB: psrlw(dst, shift);  break;
1230 
1231     default: assert(false, "%s", NodeClassNames[opcode]);
1232   }
1233 }
1234 
1235 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1236   switch (opcode) {
1237     case Op_RShiftVB:  // fall-through
1238     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1239 
1240     case Op_LShiftVB:  // fall-through
1241     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1242 
1243     case Op_URShiftVS: // fall-through
1244     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1245 
1246     default: assert(false, "%s", NodeClassNames[opcode]);
1247   }
1248 }
1249 
1250 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1251   switch (opcode) {
1252     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems
1253     case Op_LShiftVL:  psllq(dst, shift); break;
1254     case Op_URShiftVL: psrlq(dst, shift); break;
1255 
1256     default: assert(false, "%s", NodeClassNames[opcode]);
1257   }
1258 }
1259 
1260 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1261   if (opcode == Op_RShiftVL) {
1262     psrlq(dst, shift);  // using srl to implement sra on pre-avs512 systems
1263   } else if (opcode == Op_LShiftVL) {
1264     psllq(dst, shift);
1265   } else {
1266     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1267     psrlq(dst, shift);
1268   }
1269 }
1270 
1271 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1272   switch (opcode) {
1273     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1274     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1275     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1276 
1277     default: assert(false, "%s", NodeClassNames[opcode]);
1278   }
1279 }
1280 
1281 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1282   if (opcode == Op_RShiftVL) {
1283     evpsraq(dst, nds, shift, vector_len);
1284   } else if (opcode == Op_LShiftVL) {
1285     vpsllq(dst, nds, shift, vector_len);
1286   } else {
1287     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1288     vpsrlq(dst, nds, shift, vector_len);
1289   }
1290 }
1291 
1292 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1293   switch (opcode) {
1294     case Op_RShiftVB:  // fall-through
1295     case Op_RShiftVS:  // fall-through
1296     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1297 
1298     case Op_LShiftVB:  // fall-through
1299     case Op_LShiftVS:  // fall-through
1300     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1301 
1302     case Op_URShiftVB: // fall-through
1303     case Op_URShiftVS: // fall-through
1304     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1305 
1306     default: assert(false, "%s", NodeClassNames[opcode]);
1307   }
1308 }
1309 
1310 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1311   switch (opcode) {
1312     case Op_RShiftVB:  // fall-through
1313     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1314 
1315     case Op_LShiftVB:  // fall-through
1316     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1317 
1318     case Op_URShiftVB: // fall-through
1319     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1320 
1321     default: assert(false, "%s", NodeClassNames[opcode]);
1322   }
1323 }
1324 
void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  // Per-lane variable 64-bit shifts. Only the arithmetic right shift needs a
  // temp (tmp) and only on AVX2, where evpsravq is unavailable.
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          // Without AVX512VL, only the 512-bit encoding of evpsravq is legal.
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        // AVX2 has no variable arithmetic right shift for quadwords.
        // Emulate via the identity sra(x, s) == (srl(x, s) ^ m) - m,
        // where m = srl(sign_mask, s) extends the sign into the vacated bits.
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}
1357 
// Variable shift src by shift using vtmp as a TEMP, giving word result in dst
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  // Widen bytes to dwords (sign-extend only for the arithmetic right shift),
  // then shift at dword granularity.
  vextendbd(sign, dst, src, 1);
  // Shift counts are always zero-extended.
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  // Mask each dword back down to its low byte before narrowing.
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
  // Pack the high 128-bit half against the low half: dwords -> words.
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}
1372 
// Variable shift src by shift using vtmp as a TEMP, giving byte result in dst
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  // Work one vector size up: bytes are widened to words before shifting.
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  // Shift counts are always zero-extended.
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  // Keep only the low byte of every word so the unsigned pack cannot saturate.
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
  if (vector_len == 0) {
    // 256 -> 128 bits: pack the two 128-bit halves.
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    // 512 -> 256 bits: pack, then fix up the 128-bit lane interleaving
    // introduced by vpackuswb.
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    vpermq(dst, dst, 0xD8, vector_len);
  }
}
1393 
1394 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1395   switch(typ) {
1396     case T_BYTE:
1397       pinsrb(dst, val, idx);
1398       break;
1399     case T_SHORT:
1400       pinsrw(dst, val, idx);
1401       break;
1402     case T_INT:
1403       pinsrd(dst, val, idx);
1404       break;
1405     case T_LONG:
1406       pinsrq(dst, val, idx);
1407       break;
1408     default:
1409       assert(false,"Should not reach here.");
1410       break;
1411   }
1412 }
1413 
1414 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1415   switch(typ) {
1416     case T_BYTE:
1417       vpinsrb(dst, src, val, idx);
1418       break;
1419     case T_SHORT:
1420       vpinsrw(dst, src, val, idx);
1421       break;
1422     case T_INT:
1423       vpinsrd(dst, src, val, idx);
1424       break;
1425     case T_LONG:
1426       vpinsrq(dst, src, val, idx);
1427       break;
1428     default:
1429       assert(false,"Should not reach here.");
1430       break;
1431   }
1432 }
1433 
1434 void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
1435                                          Register base, Register idx_base,
1436                                          Register mask, Register mask_idx,
1437                                          Register rtmp, int vlen_enc) {
1438   vpxor(dst, dst, dst, vlen_enc);
1439   if (elem_bt == T_SHORT) {
1440     for (int i = 0; i < 4; i++) {
1441       // dst[i] = mask[i] ? src[idx_base[i]] : 0
1442       Label skip_load;
1443       btq(mask, mask_idx);
1444       jccb(Assembler::carryClear, skip_load);
1445       movl(rtmp, Address(idx_base, i * 4));
1446       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1447       bind(skip_load);
1448       incq(mask_idx);
1449     }
1450   } else {
1451     assert(elem_bt == T_BYTE, "");
1452     for (int i = 0; i < 8; i++) {
1453       // dst[i] = mask[i] ? src[idx_base[i]] : 0
1454       Label skip_load;
1455       btq(mask, mask_idx);
1456       jccb(Assembler::carryClear, skip_load);
1457       movl(rtmp, Address(idx_base, i * 4));
1458       pinsrb(dst, Address(base, rtmp), i);
1459       bind(skip_load);
1460       incq(mask_idx);
1461     }
1462   }
1463 }
1464 
1465 void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
1466                                   Register base, Register idx_base,
1467                                   Register rtmp, int vlen_enc) {
1468   vpxor(dst, dst, dst, vlen_enc);
1469   if (elem_bt == T_SHORT) {
1470     for (int i = 0; i < 4; i++) {
1471       // dst[i] = src[idx_base[i]]
1472       movl(rtmp, Address(idx_base, i * 4));
1473       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1474     }
1475   } else {
1476     assert(elem_bt == T_BYTE, "");
1477     for (int i = 0; i < 8; i++) {
1478       // dst[i] = src[idx_base[i]]
1479       movl(rtmp, Address(idx_base, i * 4));
1480       pinsrb(dst, Address(base, rtmp), i);
1481     }
1482   }
1483 }
1484 
1485 /*
1486  * Gather using hybrid algorithm, first partially unroll scalar loop
1487  * to accumulate values from gather indices into a quad-word(64bit) slice.
1488  * A slice may hold 8 bytes or 4 short values. This is followed by a vector
1489  * permutation to place the slice into appropriate vector lane
1490  * locations in destination vector. Following pseudo code describes the
1491  * algorithm in detail:
1492  *
1493  * DST_VEC = ZERO_VEC
1494  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1495  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1496  * FOREACH_ITER:
1497  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1498  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1499  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1500  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1501  *
1502  * With each iteration, doubleword permute indices (0,1) corresponding
1503  * to gathered quadword gets right shifted by two lane positions.
1504  *
1505  */
1506 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1507                                         Register base, Register idx_base,
1508                                         Register mask, XMMRegister xtmp1,
1509                                         XMMRegister xtmp2, XMMRegister temp_dst,
1510                                         Register rtmp, Register mask_idx,
1511                                         Register length, int vector_len, int vlen_enc) {
1512   Label GATHER8_LOOP;
1513   assert(is_subword_type(elem_ty), "");
1514   movl(length, vector_len);
1515   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1516   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1517   vallones(xtmp2, vlen_enc);
1518   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1519   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1520   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1521 
1522   bind(GATHER8_LOOP);
1523     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1524     if (mask == noreg) {
1525       vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
1526     } else {
1527       vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
1528     }
1529     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1530     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1531     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1532     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1533     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1534     vpor(dst, dst, temp_dst, vlen_enc);
1535     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
1536     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1537     jcc(Assembler::notEqual, GATHER8_LOOP);
1538 }
1539 
1540 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1541   switch(typ) {
1542     case T_INT:
1543       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1544       break;
1545     case T_FLOAT:
1546       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1547       break;
1548     case T_LONG:
1549       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1550       break;
1551     case T_DOUBLE:
1552       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1553       break;
1554     default:
1555       assert(false,"Should not reach here.");
1556       break;
1557   }
1558 }
1559 
1560 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1561   switch(typ) {
1562     case T_INT:
1563       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1564       break;
1565     case T_FLOAT:
1566       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1567       break;
1568     case T_LONG:
1569       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1570       break;
1571     case T_DOUBLE:
1572       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1573       break;
1574     default:
1575       assert(false,"Should not reach here.");
1576       break;
1577   }
1578 }
1579 
1580 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1581   switch(typ) {
1582     case T_INT:
1583       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1584       break;
1585     case T_FLOAT:
1586       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1587       break;
1588     case T_LONG:
1589       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1590       break;
1591     case T_DOUBLE:
1592       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1593       break;
1594     default:
1595       assert(false,"Should not reach here.");
1596       break;
1597   }
1598 }
1599 
// Expand a byte-per-lane boolean vector in 'src' into a full lane-sized
// vector mask in 'dst': each lane becomes all-ones (true) or all-zeros.
// The negation (0 - src) turns byte values into 0 or -1 (assumes src holds
// 0/1 per byte — the usual boolean-vector encoding; confirm at callers),
// and the sign-extension widens that to the element size of 'elem_bt'.
void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
  if (vlen_in_bytes <= 16) {
    // 128-bit path: SSE forms.
    pxor (dst, dst);
    psubb(dst, src);
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  pmovsxbw(dst, dst); break;
      case T_INT:    pmovsxbd(dst, dst); break;
      case T_FLOAT:  pmovsxbd(dst, dst); break;
      case T_LONG:   pmovsxbq(dst, dst); break;
      case T_DOUBLE: pmovsxbq(dst, dst); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  } else {
    // Wider vectors: VEX/EVEX forms. Legacy (non-EVEX) encoding cannot
    // express a 512-bit sub-word mask, hence the assert.
    assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
    int vlen_enc = vector_length_encoding(vlen_in_bytes);

    vpxor (dst, dst, dst, vlen_enc);
    // For legacy encoding the byte subtraction is capped at 256 bits.
    vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);

    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */            break;
      case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
      case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
      case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
      case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
      case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  }
}
1633 
// Convert a byte-per-lane boolean vector in 'src' into an AVX-512 opmask in
// 'dst'. On targets without AVX512VL/BW (novlbwdq), byte compares to mask
// are unavailable, so the bytes are sign-extended to dwords and compared
// against a constant table instead; otherwise negate to 0/-1 bytes and use
// vpmovb2m directly. 'xtmp' is clobbered.
void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
  if (novlbwdq) {
    vpmovsxbd(xtmp, src, vlen_enc);
    evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
            Assembler::eq, true, vlen_enc, noreg);
  } else {
    vpxor(xtmp, xtmp, xtmp, vlen_enc);
    vpsubb(xtmp, xtmp, src, vlen_enc);
    evpmovb2m(dst, xtmp, vlen_enc);
  }
}
1645 
1646 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1647   if (is_integral_type(bt)) {
1648     switch (vlen_in_bytes) {
1649       case 4:  movdl(dst, src);   break;
1650       case 8:  movq(dst, src);    break;
1651       case 16: movdqu(dst, src);  break;
1652       case 32: vmovdqu(dst, src); break;
1653       case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1654       default: ShouldNotReachHere();
1655     }
1656   } else {
1657     switch (vlen_in_bytes) {
1658       case 4:  movflt(dst, src); break;
1659       case 8:  movdbl(dst, src); break;
1660       case 16: movups(dst, src); break;
1661       case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1662       case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1663       default: ShouldNotReachHere();
1664     }
1665   }
1666 }
1667 
1668 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1669   assert(rscratch != noreg || always_reachable(src), "missing");
1670 
1671   if (reachable(src)) {
1672     load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1673   } else {
1674     lea(rscratch, src);
1675     load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1676   }
1677 }
1678 
// Load a constant vector by broadcasting a single constant element from the
// constant table at 'src', choosing the cheapest broadcast the current CPU
// supports; falls back to a full vector load on pre-AVX hardware.
void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
  int vlen_enc = vector_length_encoding(vlen);
  if (VM_Version::supports_avx()) {
    if (bt == T_LONG) {
      // 64-bit integer broadcast needs AVX2; otherwise duplicate the low
      // quadword with vmovddup.
      if (VM_Version::supports_avx2()) {
        vpbroadcastq(dst, src, vlen_enc);
      } else {
        vmovddup(dst, src, vlen_enc);
      }
    } else if (bt == T_DOUBLE) {
      // vbroadcastsd has no 128-bit form, so use vmovddup there.
      if (vlen_enc != Assembler::AVX_128bit) {
        vbroadcastsd(dst, src, vlen_enc, noreg);
      } else {
        vmovddup(dst, src, vlen_enc);
      }
    } else {
      // 32-bit element: integer broadcast when AVX2 is available,
      // otherwise the AVX1 float broadcast (same bit pattern).
      if (VM_Version::supports_avx2() && is_integral_type(bt)) {
        vpbroadcastd(dst, src, vlen_enc);
      } else {
        vbroadcastss(dst, src, vlen_enc);
      }
    }
  } else if (VM_Version::supports_sse3()) {
    movddup(dst, src);
  } else {
    // No broadcast available: load the whole (pre-expanded) constant.
    load_vector(bt, dst, src, vlen);
  }
}
1707 
1708 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1709   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1710   int offset = exact_log2(type2aelembytes(bt)) << 6;
1711   if (is_floating_point_type(bt)) {
1712     offset += 128;
1713   }
1714   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1715   load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1716 }
1717 
1718 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1719 
// One 128-bit reduction step: dst = dst <op> src, lane-wise, for the
// reduction described by (opcode, typ). Used by the reduceN* helpers to
// fold vector halves together. Long min/max and long multiply require
// AVX-512 instructions; unsigned min/max use VEX/EVEX forms throughout.
void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
  int vector_len = Assembler::AVX_128bit;

  switch (opcode) {
    case Op_AndReductionV:  pand(dst, src); break;
    case Op_OrReductionV:   por (dst, src); break;
    case Op_XorReductionV:  pxor(dst, src); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:        pminsb(dst, src); break;
        case T_SHORT:       pminsw(dst, src); break;
        case T_INT:         pminsd(dst, src); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:        pmaxsb(dst, src); break;
        case T_SHORT:       pmaxsw(dst, src); break;
        case T_INT:         pmaxsd(dst, src); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMinReductionV:
      switch (typ) {
        case T_BYTE:        vpminub(dst, dst, src, Assembler::AVX_128bit); break;
        case T_SHORT:       vpminuw(dst, dst, src, Assembler::AVX_128bit); break;
        case T_INT:         vpminud(dst, dst, src, Assembler::AVX_128bit); break;
        case T_LONG:        evpminuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxub(dst, dst, src, Assembler::AVX_128bit); break;
        case T_SHORT:       vpmaxuw(dst, dst, src, Assembler::AVX_128bit); break;
        case T_INT:         vpmaxud(dst, dst, src, Assembler::AVX_128bit); break;
        case T_LONG:        evpmaxuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    // Scalar FP add/mul: ordered FP reductions fold one element at a time.
    case Op_AddReductionVF: addss(dst, src); break;
    case Op_AddReductionVD: addsd(dst, src); break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:        paddb(dst, src); break;
        case T_SHORT:       paddw(dst, src); break;
        case T_INT:         paddd(dst, src); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: paddq(dst, src); break;
    case Op_MulReductionVF: mulss(dst, src); break;
    case Op_MulReductionVD: mulsd(dst, src); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT:       pmullw(dst, src); break;
        case T_INT:         pmulld(dst, src); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: assert(UseAVX > 2, "required");
                            evpmullq(dst, dst, src, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}
1790 
1791 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1792   switch (opcode) {
1793     case Op_AddReductionVF: addps(dst, src); break;
1794     case Op_AddReductionVD: addpd(dst, src); break;
1795     case Op_MulReductionVF: mulps(dst, src); break;
1796     case Op_MulReductionVD: mulpd(dst, src); break;
1797     default:                assert(false, "%s", NodeClassNames[opcode]);
1798   }
1799 }
1800 
// One 256-bit reduction step: dst = src1 <op> src2, lane-wise, for the
// reduction described by (opcode, typ). Three-operand VEX/EVEX forms only;
// long min/max and long multiply require AVX-512. Note: no FP cases here —
// ordered FP reductions never use a 256-bit combining step.
void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
    case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
    case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
        case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpminsq(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpmaxsq(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMinReductionV:
      switch (typ) {
        case T_BYTE:        vpminub(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpminuw(dst, src1, src2, vector_len); break;
        case T_INT:         vpminud(dst, src1, src2, vector_len); break;
        case T_LONG:        evpminuq(dst, k0, src1, src2, true, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxub(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpmaxuw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmaxud(dst, src1, src2, vector_len); break;
        case T_LONG:        evpmaxuq(dst, k0, src1, src2, true, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
        case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}
1866 
1867 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1868   int vector_len = Assembler::AVX_256bit;
1869 
1870   switch (opcode) {
1871     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1872     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1873     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1874     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1875     default:                assert(false, "%s", NodeClassNames[opcode]);
1876   }
1877 }
1878 
1879 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1880                                   XMMRegister dst, XMMRegister src,
1881                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1882   switch (opcode) {
1883     case Op_AddReductionVF:
1884     case Op_MulReductionVF:
1885       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1886       break;
1887 
1888     case Op_AddReductionVD:
1889     case Op_MulReductionVD:
1890       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1891       break;
1892 
1893     default: assert(false, "wrong opcode");
1894   }
1895 }
1896 
1897 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1898                                             XMMRegister dst, XMMRegister src,
1899                                             XMMRegister vtmp1, XMMRegister vtmp2) {
1900   switch (opcode) {
1901     case Op_AddReductionVF:
1902     case Op_MulReductionVF:
1903       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1904       break;
1905 
1906     case Op_AddReductionVD:
1907     case Op_MulReductionVD:
1908       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1909       break;
1910 
1911     default: assert(false, "%s", NodeClassNames[opcode]);
1912   }
1913 }
1914 
1915 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1916                              Register dst, Register src1, XMMRegister src2,
1917                              XMMRegister vtmp1, XMMRegister vtmp2) {
1918   switch (vlen) {
1919     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1920     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1921     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1922     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1923 
1924     default: assert(false, "wrong vector length");
1925   }
1926 }
1927 
1928 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1929                              Register dst, Register src1, XMMRegister src2,
1930                              XMMRegister vtmp1, XMMRegister vtmp2) {
1931   switch (vlen) {
1932     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1933     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1934     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1935     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1936 
1937     default: assert(false, "wrong vector length");
1938   }
1939 }
1940 
1941 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1942                              Register dst, Register src1, XMMRegister src2,
1943                              XMMRegister vtmp1, XMMRegister vtmp2) {
1944   switch (vlen) {
1945     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1946     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1947     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1948     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1949 
1950     default: assert(false, "wrong vector length");
1951   }
1952 }
1953 
1954 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1955                              Register dst, Register src1, XMMRegister src2,
1956                              XMMRegister vtmp1, XMMRegister vtmp2) {
1957   switch (vlen) {
1958     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1959     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1960     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1961     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1962 
1963     default: assert(false, "wrong vector length");
1964   }
1965 }
1966 
1967 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1968                              Register dst, Register src1, XMMRegister src2,
1969                              XMMRegister vtmp1, XMMRegister vtmp2) {
1970   switch (vlen) {
1971     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1972     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1973     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1974 
1975     default: assert(false, "wrong vector length");
1976   }
1977 }
1978 
1979 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1980   switch (vlen) {
1981     case 2:
1982       assert(vtmp2 == xnoreg, "");
1983       reduce2F(opcode, dst, src, vtmp1);
1984       break;
1985     case 4:
1986       assert(vtmp2 == xnoreg, "");
1987       reduce4F(opcode, dst, src, vtmp1);
1988       break;
1989     case 8:
1990       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1991       break;
1992     case 16:
1993       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1994       break;
1995     default: assert(false, "wrong vector length");
1996   }
1997 }
1998 
1999 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2000   switch (vlen) {
2001     case 2:
2002       assert(vtmp2 == xnoreg, "");
2003       reduce2D(opcode, dst, src, vtmp1);
2004       break;
2005     case 4:
2006       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2007       break;
2008     case 8:
2009       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2010       break;
2011     default: assert(false, "wrong vector length");
2012   }
2013 }
2014 
2015 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2016   switch (vlen) {
2017     case 2:
2018       assert(vtmp1 == xnoreg, "");
2019       assert(vtmp2 == xnoreg, "");
2020       unorderedReduce2F(opcode, dst, src);
2021       break;
2022     case 4:
2023       assert(vtmp2 == xnoreg, "");
2024       unorderedReduce4F(opcode, dst, src, vtmp1);
2025       break;
2026     case 8:
2027       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2028       break;
2029     case 16:
2030       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2031       break;
2032     default: assert(false, "wrong vector length");
2033   }
2034 }
2035 
2036 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2037   switch (vlen) {
2038     case 2:
2039       assert(vtmp1 == xnoreg, "");
2040       assert(vtmp2 == xnoreg, "");
2041       unorderedReduce2D(opcode, dst, src);
2042       break;
2043     case 4:
2044       assert(vtmp2 == xnoreg, "");
2045       unorderedReduce4D(opcode, dst, src, vtmp1);
2046       break;
2047     case 8:
2048       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2049       break;
2050     default: assert(false, "wrong vector length");
2051   }
2052 }
2053 
// Reduce the two int lanes of src2, combine with the scalar accumulator
// src1, and leave the scalar result in dst. vtmp1/vtmp2 are clobbered.
void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    // Horizontal add folds lanes 0 and 1 into lane 0 in one instruction.
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, vtmp1);
  } else {
    // Bring lane 1 down to lane 0 and combine with the original lane 0.
    pshufd(vtmp1, src2, 0x1);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
  }
  // Fold in the scalar accumulator and move the result out.
  movdl(vtmp2, src1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  movdl(dst, vtmp1);
}
2068 
// Reduce four int lanes of src2 with scalar src1 into dst: fold the upper
// pair onto the lower pair, then finish with the 2-lane reduction.
void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    // phaddd collapses 4 lanes to 2 partial sums in lanes 0-1.
    phaddd(vtmp1, src2);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    // 0xE moves the upper 64 bits (lanes 2,3) into lanes 0,1.
    pshufd(vtmp2, src2, 0xE);
    reduce_operation_128(T_INT, opcode, vtmp2, src2);
    reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
  }
}
2082 
// Reduce eight int lanes (256-bit src2) with scalar src1 into dst: fold the
// high 128-bit half onto the low half, then recurse to the narrower cases.
void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    // 256-bit horizontal add leaves 4 partial sums split across halves;
    // extract and add the high half before the final 2-lane step.
    vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
    vextracti128_high(vtmp2, vtmp1);
    vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    vextracti128_high(vtmp1, src2);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
    reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  }
}
2095 
// Reduce sixteen int lanes (512-bit src2) with scalar src1 into dst: fold
// the high 256-bit half onto the low half, then use the 8-lane reduction.
void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
  reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2101 
// Reduce eight byte lanes of src2 with scalar src1 into dst by repeatedly
// halving: shift the upper half down and combine, until one byte remains,
// then widen and fold in the accumulator.
void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp2, src2, 0x1);                       // bytes 4..7 down to 0..3
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  movdqu(vtmp1, vtmp2);
  psrldq(vtmp1, 2);                               // fold 4 bytes -> 2
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  movdqu(vtmp2, vtmp1);
  psrldq(vtmp2, 1);                               // fold 2 bytes -> 1
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  movdl(vtmp2, src1);
  // Widen the byte result to an int lane before combining with the scalar
  // accumulator; unsigned min/max must zero-extend, everything else
  // sign-extends.
  if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
    pmovzxbd(vtmp1, vtmp1);
  } else {
    pmovsxbd(vtmp1, vtmp1);
  }
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrb(dst, vtmp1, 0x0);
  movsbl(dst, dst);                               // sign-extend byte result to int
}
2121 
// Reduce sixteen byte lanes: fold the upper 8 bytes onto the lower 8, then
// finish with the 8-byte reduction.
void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp1, src2, 0xE);
  reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
  reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2127 
// Reduce thirty-two byte lanes (256-bit src2): fold the high 128-bit half
// onto the low half, then finish with the 16-byte reduction.
void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp2, src2);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2133 
// Reduce sixty-four byte lanes (512-bit src2): fold the high 256-bit half
// onto the low half, then finish with the 32-byte reduction.
void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp1, src2);
  reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
  reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2139 
// Multiply-reduce eight byte lanes: no byte multiply exists on x86, so
// sign-extend the bytes to shorts and use the short reduction.
void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pmovsxbw(vtmp2, src2);
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2144 
// Multiply-reduce sixteen byte lanes by widening to shorts. With AVX2 all
// sixteen bytes widen into one 256-bit short vector; otherwise the two
// 8-byte halves are widened and reduced separately, feeding the first
// partial result back in as the scalar accumulator of the second pass.
void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 1) {
    int vector_len = Assembler::AVX_256bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    pmovsxbw(vtmp2, src2);
    reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
    pshufd(vtmp2, src2, 0xe);       // upper 8 bytes down to the low half
    pmovsxbw(vtmp2, vtmp2);
    reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
  }
}
2158 
// Multiply-reduce thirty-two byte lanes. With AVX512BW all bytes widen to
// one 512-bit short vector; otherwise split into two 16-byte halves and
// chain the partial result through dst as the second pass's accumulator.
void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
    int vector_len = Assembler::AVX_512bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce32S(opcode, dst, src1, vtmp1, vtmp2, vtmp1);
  } else {
    assert(UseAVX >= 2,"Should not reach here.");
    mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
    vextracti128_high(vtmp2, src2);
    mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
  }
}
2171 
// Multiply-reduce sixty-four byte lanes: reduce the low 256-bit half, then
// the high half, chaining the partial result through dst.
void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
  vextracti64x4_high(vtmp2, src2);
  mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
}
2177 
// Reduce four short lanes of src2 with scalar src1 into dst. For addition,
// two horizontal adds collapse 4 lanes to 1; otherwise halve twice by
// shifting the upper lanes down. The short result is widened to an int lane
// before combining with the accumulator.
void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddw(vtmp1, vtmp1);
    phaddw(vtmp1, vtmp1);
  } else {
    pshufd(vtmp2, src2, 0x1);       // shorts 2,3 down to 0,1
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
    movdqu(vtmp1, vtmp2);
    psrldq(vtmp1, 2);               // fold 2 shorts -> 1
    reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
  }
  movdl(vtmp2, src1);
  // Unsigned min/max must zero-extend the short; everything else
  // sign-extends.
  if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
    pmovzxwd(vtmp1, vtmp1);
  } else {
    pmovsxwd(vtmp1, vtmp1);
  }
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrw(dst, vtmp1, 0x0);
  movswl(dst, dst);                 // sign-extend short result to int
}
2202 
// Reduce eight short lanes of src2 with scalar src1 into dst: fold 8 lanes
// to 4 (horizontal add, or shuffle+combine), then finish with reduce4S.
void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddw(vtmp1, src2);
  } else {
    // vtmp1 receives the shuffled copy before src2 is re-read, so they
    // must not alias.
    assert_different_registers(src2, vtmp1);
    pshufd(vtmp1, src2, 0xE);
    reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
  }
  reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2216 
// Reduce sixteen short lanes (256-bit src2) with scalar src1 into dst.
// For addition a 256-bit horizontal add interleaves results per 128-bit
// lane, so vpermq(0xD8) regroups them; otherwise fold the high half onto
// the low half. Then finish with reduce8S.
void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    int vector_len = Assembler::AVX_256bit;
    vphaddw(vtmp2, src2, src2, vector_len);
    vpermq(vtmp2, vtmp2, 0xD8, vector_len);
  } else {
    // vtmp2 receives the extracted high half before src2 is re-read.
    assert_different_registers(src2, vtmp2);
    vextracti128_high(vtmp2, src2);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
  }
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2229 
2230 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2231   assert_different_registers(src2, vtmp1);
2232   int vector_len = Assembler::AVX_256bit;
2233   vextracti64x4_high(vtmp1, src2);
2234   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2235   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2236 }
2237 
// Reduce the two long lanes of src2, combine with the scalar accumulator
// src1, and leave the scalar result in dst.
void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp2, src2, 0xE);         // upper quadword down to the low lane
  reduce_operation_128(T_LONG, opcode, vtmp2, src2);
  movdq(vtmp1, src1);
  reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
  movdq(dst, vtmp1);
}
2245 
// Reduce four long lanes (256-bit src2): fold the high 128-bit half onto
// the low half, then finish with the 2-lane reduction.
void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp1, src2);
  reduce_operation_128(T_LONG, opcode, vtmp1, src2);
  reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2251 
// Reduce eight long lanes (512-bit src2): fold the high 256-bit half onto
// the low half, then finish with the 4-lane reduction.
void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
  reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2257 
// Build an opmask with the low 'len' bits set: BZHI clears -1 above bit
// position 'len', and the result is moved into the k-register. 'temp' is
// clobbered.
void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
  mov64(temp, -1L);
  bzhiq(temp, temp, len);
  kmovql(dst, temp);
}
2263 
// Ordered reduction of two float lanes: fold src lane 0 into the scalar
// accumulator in dst, then lane 1 (brought down via shuffle). Sequential
// scalar ops preserve the strict left-to-right FP evaluation order.
void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_FLOAT, opcode, dst, src);
  pshufd(vtmp, src, 0x1);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
2269 
// Ordered reduction of four float lanes: fold lanes 0 and 1 via reduce2F,
// then lanes 2 and 3 individually, keeping strict lane order.
void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce2F(opcode, dst, src, vtmp);
  pshufd(vtmp, src, 0x2);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
  pshufd(vtmp, src, 0x3);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
2277 
// Ordered reduction of eight float lanes (256-bit src): reduce the low
// 128-bit half, then the extracted high half, keeping lane order.
void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4F(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce4F(opcode, dst, vtmp2, vtmp1);
}
2283 
// Ordered reduction of sixteen float lanes (512-bit src): reduce the low
// 256-bit half, then the extracted high half, keeping lane order.
void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce8F(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
}
2289 
// Unordered reduction of two float lanes: lane 1 is shuffled into dst and
// combined with lane 0 using the scalar op (which only touches lane 0).
void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0x1);
  reduce_operation_128(T_FLOAT, opcode, dst, src);
}
2294 
// Unordered reduction of 4 floats of src into dst. vtmp is clobbered.
void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  pshufd(vtmp, src, 0xE);                                     // bring elements 2,3 down
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src); // pairwise combine halves
  unorderedReduce2F(opcode, dst, vtmp);                       // finish the 2-element step
}
2300 
// Unordered reduction of 8 floats of the 256-bit vector src into dst.
// vtmp1 and vtmp2 are clobbered.
void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf128_high(vtmp1, src);                               // vtmp1 = upper 4 elements
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src); // combine halves pairwise
  unorderedReduce4F(opcode, dst, vtmp1, vtmp2);                // finish the 4-element step
}
2306 
// Unordered reduction of 16 floats of the 512-bit vector src into dst.
// vtmp1 and vtmp2 are clobbered.
void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);                                      // vtmp2 = upper 8 elements
  unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);  // combine 256-bit halves
  unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);                 // finish the 8-element step
}
2312 
// Fold the 2 doubles of src into dst (dst carries the running value).
// vtmp is clobbered.
void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_DOUBLE, opcode, dst, src);  // fold in element 0
  pshufd(vtmp, src, 0xE);                            // bring element 1 down to slot 0
  reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); // fold in element 1
}
2318 
// Fold 4 doubles of the 256-bit vector src into dst (dst carries the running value).
// vtmp1 and vtmp2 are clobbered.
void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce2D(opcode, dst, src, vtmp2);    // fold the lower 2 elements
  vextractf128_high(vtmp2, src);        // vtmp2 = upper 2 elements
  reduce2D(opcode, dst, vtmp2, vtmp1);  // fold the upper 2 elements
}
2324 
// Fold 8 doubles of the 512-bit vector src into dst (dst carries the running value).
// vtmp1 and vtmp2 are clobbered.
void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4D(opcode, dst, src, vtmp1, vtmp2);  // fold the lower 4 elements
  vextracti64x4_high(vtmp1, src);            // vtmp1 = upper 4 elements
  reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);// fold the upper 4 elements
}
2330 
// Unordered reduction of 2 doubles: combine elements 0 and 1 of src into dst.
void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0xE);                            // dst slot 0 = src element 1
  reduce_operation_128(T_DOUBLE, opcode, dst, src); // combine with src element 0
}
2335 
// Unordered reduction of 4 doubles of the 256-bit vector src into dst.
// vtmp is clobbered.
void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  vextractf128_high(vtmp, src);                                // vtmp = upper 2 elements
  unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src); // combine halves pairwise
  unorderedReduce2D(opcode, dst, vtmp);                        // finish the 2-element step
}
2341 
// Unordered reduction of 8 doubles of the 512-bit vector src into dst.
// vtmp1 and vtmp2 are clobbered.
void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);                                      // vtmp2 = upper 4 elements
  unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src); // combine 256-bit halves
  unorderedReduce4D(opcode, dst, vtmp2, vtmp1);                        // finish the 4-element step
}
2347 
// Masked vector load (memory -> register); forwards to the shared MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2351 
// Masked vector store (register -> memory); forwards to the shared MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2355 
// Masked register-to-register vector move; forwards to the shared MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2359 
2360 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2361                                  int vec_enc) {
2362   switch(elem_bt) {
2363     case T_INT:
2364     case T_FLOAT:
2365       vmaskmovps(dst, src, mask, vec_enc);
2366       break;
2367     case T_LONG:
2368     case T_DOUBLE:
2369       vmaskmovpd(dst, src, mask, vec_enc);
2370       break;
2371     default:
2372       fatal("Unsupported type %s", type2name(elem_bt));
2373       break;
2374   }
2375 }
2376 
2377 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2378                                  int vec_enc) {
2379   switch(elem_bt) {
2380     case T_INT:
2381     case T_FLOAT:
2382       vmaskmovps(dst, src, mask, vec_enc);
2383       break;
2384     case T_LONG:
2385     case T_DOUBLE:
2386       vmaskmovpd(dst, src, mask, vec_enc);
2387       break;
2388     default:
2389       fatal("Unsupported type %s", type2name(elem_bt));
2390       break;
2391   }
2392 }
2393 
// Min/max reduction of a float vector with vlen = 2/4/8/16 elements.
// Each loop iteration halves the number of candidate lanes: the upper
// half is extracted (or permuted down for the last two steps) into wtmp
// and combined with wsrc into wdst, then wdst becomes the next wsrc.
// When is_dst_valid, dst holds an initial value that is folded in with
// one final min/max at the end; otherwise the last iteration writes
// straight into dst. tmp/atmp/btmp are temporaries for the legacy
// (non-AVX10.2) NaN/-0.0-aware min/max sequence.
void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
                                          XMMRegister dst, XMMRegister src,
                                          XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                          XMMRegister xmm_0, XMMRegister xmm_1) {
  // Permute selectors for the final two steps: 1 swaps adjacent elements,
  // 14 (0b1110) brings elements 2,3 down.
  const int permconst[] = {1, 14};
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  // With a single scratch register, reuse xmm_0 as the shuffle temp.
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;

  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 16) {
    vlen_enc = Assembler::AVX_256bit;
  }

  // log2(vlen) halving steps, widest first.
  for (int i = log2(vlen) - 1; i >=0; i--) {
    if (i == 0 && !is_dst_valid) {
      wdst = dst; // last step writes the result directly
    }
    if (i == 3) {
      vextracti64x4_high(wtmp, wsrc);       // upper 256 bits (vlen == 16)
    } else if (i == 2) {
      vextracti128_high(wtmp, wsrc);        // upper 128 bits
    } else { // i = [0,1]
      vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
    }

    if (VM_Version::supports_avx10_2()) {
      vminmax_fp_avx10_2(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }
    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit; // after the first step everything fits in 128 bits
  }
  if (is_dst_valid) {
    // Fold the caller-provided initial value in dst into the reduced result.
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp_avx10_2(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2436 
// Min/max reduction of a double vector with vlen = 2/4/8 elements.
// Same halving structure as reduceFloatMinMax: each iteration extracts
// (or permutes down) the upper half into wtmp, combines it with wsrc
// into wdst, and wdst becomes the next wsrc. When is_dst_valid, dst
// carries an initial value folded in by one final min/max.
// tmp/atmp/btmp are temporaries for the legacy (non-AVX10.2) path.
void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
                                        XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                        XMMRegister xmm_0, XMMRegister xmm_1) {
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  // With a single scratch register, reuse xmm_0 as the shuffle temp.
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 8) {
    vlen_enc = Assembler::AVX_256bit;
  }
  // log2(vlen) halving steps, widest first.
  for (int i = log2(vlen) - 1; i >=0; i--) {
    if (i == 0 && !is_dst_valid) {
      wdst = dst; // last step writes the result directly
    }
    if (i == 1) {
      vextracti128_high(wtmp, wsrc);  // upper 128 bits
    } else if (i == 2) {
      vextracti64x4_high(wtmp, wsrc); // upper 256 bits (vlen == 8)
    } else {
      assert(i == 0, "%d", i);
      vpermilpd(wtmp, wsrc, 1, vlen_enc); // swap the two remaining doubles
    }

    if (VM_Version::supports_avx10_2()) {
      vminmax_fp_avx10_2(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }

    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit; // after the first step everything fits in 128 bits
  }

  if (is_dst_valid) {
    // Fold the caller-provided initial value in dst into the reduced result.
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp_avx10_2(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2478 
2479 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2480   switch (bt) {
2481     case T_BYTE:  pextrb(dst, src, idx); break;
2482     case T_SHORT: pextrw(dst, src, idx); break;
2483     case T_INT:   pextrd(dst, src, idx); break;
2484     case T_LONG:  pextrq(dst, src, idx); break;
2485 
2486     default:
2487       assert(false,"Should not reach here.");
2488       break;
2489   }
2490 }
2491 
// Return a register holding the 128-bit lane that contains element 'elemindex'
// of src. Lane 0 needs no extraction, so src itself is returned; higher lanes
// are extracted into dst and dst is returned.
XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
  int esize =  type2aelembytes(typ);
  int elem_per_lane = 16/esize;          // elements per 128-bit lane
  int lane = elemindex / elem_per_lane;  // which 128-bit lane holds the element
  int eindex = elemindex % elem_per_lane;

  if (lane >= 2) {
    // Lanes 2/3 only exist in 512-bit vectors; needs AVX-512.
    assert(UseAVX > 2, "required");
    vextractf32x4(dst, src, lane & 3);
    return dst;
  } else if (lane > 0) {
    assert(UseAVX > 0, "required");
    vextractf128(dst, src, lane);
    return dst;
  } else {
    return src; // element is already in the low lane
  }
}
2510 
2511 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2512   if (typ == T_BYTE) {
2513     movsbl(dst, dst);
2514   } else if (typ == T_SHORT) {
2515     movswl(dst, dst);
2516   }
2517 }
2518 
// Extract the integral element at 'elemindex' (reduced modulo the 128-bit
// lane size — src is presumably the correct lane already, see get_lane;
// verify against callers) into GPR dst, sign-extending sub-int types.
void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
  int esize =  type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane; // index within the 128-bit lane
  assert(is_integral_type(typ),"required");

  if (eindex == 0) {
    // Element 0 can be moved directly without an extract instruction.
    if (typ == T_LONG) {
      movq(dst, src);
    } else {
      movdl(dst, src);
      movsxl(typ, dst); // sign-extend byte/short to 32 bits
    }
  } else {
    extract(typ, dst, src, eindex);
    movsxl(typ, dst); // sign-extend byte/short to 32 bits
  }
}
2537 
// Extract the FP element at 'elemindex' (reduced modulo the 128-bit lane
// size — src is presumably the correct lane already, see get_lane; verify
// against callers) into the low slot of dst, zeroing the bits above it.
// vtmp is only needed on the non-AVX T_FLOAT path.
void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
  int esize =  type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane; // index within the 128-bit lane
  assert((typ == T_FLOAT || typ == T_DOUBLE),"required");

  if (eindex == 0) {
    // movq copies the low 64 bits and zeroes the rest.
    movq(dst, src);
  } else {
    if (typ == T_FLOAT) {
      // Shuffle the wanted float into slot 0.
      if (UseAVX == 0) {
        movdqu(dst, src);
        shufps(dst, dst, eindex);
      } else {
        vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
      }
    } else {
      // Shift the wanted double down to slot 0 ...
      if (UseAVX == 0) {
        movdqu(dst, src);
        psrldq(dst, eindex*esize);
      } else {
        vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
      }
      // ... and clear everything above the low 64 bits.
      movq(dst, dst);
    }
  }
  // Zero upper bits
  if (typ == T_FLOAT) {
    // Mask off everything above the low 32 bits.
    if (UseAVX == 0) {
      assert(vtmp != xnoreg, "required.");
      movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
      pand(dst, vtmp);
    } else {
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
    }
  }
}
2575 
2576 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2577   switch(typ) {
2578     case T_BYTE:
2579     case T_BOOLEAN:
2580       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2581       break;
2582     case T_SHORT:
2583     case T_CHAR:
2584       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2585       break;
2586     case T_INT:
2587     case T_FLOAT:
2588       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2589       break;
2590     case T_LONG:
2591     case T_DOUBLE:
2592       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2593       break;
2594     default:
2595       assert(false,"Should not reach here.");
2596       break;
2597   }
2598 }
2599 
2600 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2601   assert(rscratch != noreg || always_reachable(src2), "missing");
2602 
2603   switch(typ) {
2604     case T_BOOLEAN:
2605     case T_BYTE:
2606       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2607       break;
2608     case T_CHAR:
2609     case T_SHORT:
2610       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2611       break;
2612     case T_INT:
2613     case T_FLOAT:
2614       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2615       break;
2616     case T_LONG:
2617     case T_DOUBLE:
2618       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2619       break;
2620     default:
2621       assert(false,"Should not reach here.");
2622       break;
2623   }
2624 }
2625 
2626 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2627   switch(typ) {
2628     case T_BYTE:
2629       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2630       break;
2631     case T_SHORT:
2632       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2633       break;
2634     case T_INT:
2635     case T_FLOAT:
2636       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2637       break;
2638     case T_LONG:
2639     case T_DOUBLE:
2640       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2641       break;
2642     default:
2643       assert(false,"Should not reach here.");
2644       break;
2645   }
2646 }
2647 
// Emit a vector test of src1 against src2 that sets the flags for a
// following branch. Supports up to 256-bit vectors; sub-128-bit inputs
// are widened by duplicating the low part of src1 into vtmp so that the
// full 128-bit test instruction can be used.
void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
  assert(vlen_in_bytes <= 32, "");
  int esize = type2aelembytes(bt);
  if (vlen_in_bytes == 32) {
    assert(vtmp == xnoreg, "required.");
    if (esize >= 4) {
      vtestps(src1, src2, AVX_256bit);
    } else {
      vptest(src1, src2, AVX_256bit);
    }
    return;
  }
  if (vlen_in_bytes < 16) {
    // Duplicate the lower part to fill the whole register,
    // Don't need to do so for src2
    assert(vtmp != xnoreg, "required");
    // 0x00 replicates dword 0 everywhere (4-byte input); 0x04 replicates
    // dwords 0,1 into the upper half (8-byte input).
    int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
    pshufd(vtmp, src1, shuffle_imm);
  } else {
    assert(vtmp == xnoreg, "required");
    vtmp = src1; // full 128-bit input, test src1 directly
  }
  if (esize >= 4 && VM_Version::supports_avx()) {
    vtestps(vtmp, src2, AVX_128bit);
  } else {
    ptest(vtmp, src2);
  }
}
2676 
// Element-wise vector add: dst = src1 + src2 for the given element type.
// The debug-only block verifies that byte/short adds without AVX512BW
// stay below 512 bits and use only XMM registers 0-15.
void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
#ifdef ASSERT
  bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
  bool is_bw_supported = VM_Version::supports_avx512bw();
  if (is_bw && !is_bw_supported) {
    assert(vlen_enc != Assembler::AVX_512bit, "required");
    assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
           "XMM register should be 0-15");
  }
#endif // ASSERT
  switch (elem_bt) {
    case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
    case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
    case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
    case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
    case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
    case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
    default: fatal("Unsupported type %s", type2name(elem_bt)); return;
  }
}
2697 
// Broadcast the scalar in GPR src to every lane of vector dst.
// When AVX-512 is available with the needed BW/VL subfeatures, the EVEX
// GPR-source broadcast forms are used directly; otherwise the value is
// first moved to an XMM register and broadcast with the AVX2 forms,
// which restricts dst to XMM0-15 and the vector to at most 256 bits.
void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
  assert(UseAVX >= 2, "required");
  bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
  bool is_vl = vlen_enc != Assembler::AVX_512bit;
  if ((UseAVX > 2) &&
      (!is_bw || VM_Version::supports_avx512bw()) &&
      (!is_vl || VM_Version::supports_avx512vl())) {
    // EVEX broadcast straight from the GPR.
    switch (elem_bt) {
      case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
      case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
      case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
      case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  } else {
    assert(vlen_enc != Assembler::AVX_512bit, "required");
    assert((dst->encoding() < 16),"XMM register should be 0-15");
    // Move the scalar to an XMM register, then broadcast register-to-register.
    switch (elem_bt) {
      case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
      case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
      case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
      case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
      case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
      case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  }
}
2726 
// Convert a vector of bytes in src to a vector of to_elem_bt in dst:
// sign-extension for integral targets, sign-extension followed by an
// int-to-FP conversion for T_FLOAT/T_DOUBLE.
void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
  switch (to_elem_bt) {
    case T_SHORT:
      vpmovsxbw(dst, src, vlen_enc);
      break;
    case T_INT:
      vpmovsxbd(dst, src, vlen_enc);
      break;
    case T_FLOAT:
      vpmovsxbd(dst, src, vlen_enc);
      vcvtdq2ps(dst, dst, vlen_enc); // int -> float, same lane count
      break;
    case T_LONG:
      vpmovsxbq(dst, src, vlen_enc);
      break;
    case T_DOUBLE: {
      // vcvtdq2pd doubles the element width, so the intermediate int
      // vector only needs half the final vector length.
      int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
      vpmovsxbd(dst, src, mid_vlen_enc);
      vcvtdq2pd(dst, dst, vlen_enc);
      break;
    }
    default:
      fatal("Unsupported type %s", type2name(to_elem_bt));
      break;
  }
}
2753 
2754 //-------------------------------------------------------------------------------------------
2755 
// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through stack.
//
// Register/parameter contract (see asserts below for the bound registers):
//   str1/cnt1 (rdx) - string to scan and its element count
//   str2/cnt2 (rax) - substring address; its length is the compile-time
//                     constant int_cnt2 (>= stride)
//   result          - on exit: element index of the match, or -1
//   vec, tmp (rcx)  - temporaries; rcx receives the pcmpestri match index
//   ae              - argument encodings (StrIntrinsicNode::LL/UU/UL)
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2,  Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");

  // Load substring.
  // UL compares Latin-1 substring bytes against UTF-16 string chars, so
  // the substring bytes are zero-extended to shorts on load.
  if (ae == StrIntrinsicNode::UL) {
    pmovzxbw(vec, Address(str2, 0));
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring reminding elements and
    // cnt1 is number of string reminding elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less then substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less then substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only string if does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    negptr(cnt2);
    addptr(cnt2, stride);

    bind(SCAN_SUBSTR);
    subl(cnt1, stride);
    cmpl(cnt2, -stride); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, stride);
    movl(cnt2, stride); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      int tail_off1 = int_cnt2<<scale1;
      int tail_off2 = int_cnt2<<scale2;
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
      } else {
        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
      }
      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
      } else {
        movdqu(vec, Address(str2, tmp, scale2, 0));
      }
      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
    }
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, stride);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(EXIT);

} // string_indexofC8
2934 
2935 // Small strings are loaded through stack if they cross page boundary.
2936 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2937                                        Register cnt1, Register cnt2,
2938                                        int int_cnt2,  Register result,
2939                                        XMMRegister vec, Register tmp,
2940                                        int ae) {
2941   ShortBranchVerifier sbv(this);
2942   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2943   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2944 
2945   //
2946   // int_cnt2 is length of small (< 8 chars) constant substring
2947   // or (-1) for non constant substring in which case its length
2948   // is in cnt2 register.
2949   //
2950   // Note, inline_string_indexOf() generates checks:
2951   // if (substr.count > string.count) return -1;
2952   // if (substr.count == 0) return 0;
2953   //
2954   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2955   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2956   // This method uses the pcmpestri instruction with bound registers
2957   //   inputs:
2958   //     xmm - substring
2959   //     rax - substring length (elements count)
2960   //     mem - scanned string
2961   //     rdx - string length (elements count)
2962   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2963   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2964   //   outputs:
2965   //     rcx - matched index in string
2966   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2967   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2968   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2969   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2970 
2971   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2972         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2973         FOUND_CANDIDATE;
2974 
2975   { //========================================================
2976     // We don't know where these strings are located
2977     // and we can't read beyond them. Load them through stack.
2978     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2979 
2980     movptr(tmp, rsp); // save old SP
2981 
2982     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2983       if (int_cnt2 == (1>>scale2)) { // One byte
2984         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2985         load_unsigned_byte(result, Address(str2, 0));
2986         movdl(vec, result); // move 32 bits
2987       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2988         // Not enough header space in 32-bit VM: 12+3 = 15.
2989         movl(result, Address(str2, -1));
2990         shrl(result, 8);
2991         movdl(vec, result); // move 32 bits
2992       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2993         load_unsigned_short(result, Address(str2, 0));
2994         movdl(vec, result); // move 32 bits
2995       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2996         movdl(vec, Address(str2, 0)); // move 32 bits
2997       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2998         movq(vec, Address(str2, 0));  // move 64 bits
2999       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
3000         // Array header size is 12 bytes in 32-bit VM
3001         // + 6 bytes for 3 chars == 18 bytes,
3002         // enough space to load vec and shift.
3003         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3004         if (ae == StrIntrinsicNode::UL) {
3005           int tail_off = int_cnt2-8;
3006           pmovzxbw(vec, Address(str2, tail_off));
3007           psrldq(vec, -2*tail_off);
3008         }
3009         else {
3010           int tail_off = int_cnt2*(1<<scale2);
3011           movdqu(vec, Address(str2, tail_off-16));
3012           psrldq(vec, 16-tail_off);
3013         }
3014       }
3015     } else { // not constant substring
3016       cmpl(cnt2, stride);
3017       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3018 
3019       // We can read beyond string if srt+16 does not cross page boundary
3020       // since heaps are aligned and mapped by pages.
3021       assert(os::vm_page_size() < (int)G, "default page should be small");
3022       movl(result, str2); // We need only low 32 bits
3023       andl(result, ((int)os::vm_page_size()-1));
3024       cmpl(result, ((int)os::vm_page_size()-16));
3025       jccb(Assembler::belowEqual, CHECK_STR);
3026 
3027       // Move small strings to stack to allow load 16 bytes into vec.
3028       subptr(rsp, 16);
3029       int stk_offset = wordSize-(1<<scale2);
3030       push(cnt2);
3031 
3032       bind(COPY_SUBSTR);
3033       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3034         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3035         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3036       } else if (ae == StrIntrinsicNode::UU) {
3037         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3038         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3039       }
3040       decrement(cnt2);
3041       jccb(Assembler::notZero, COPY_SUBSTR);
3042 
3043       pop(cnt2);
3044       movptr(str2, rsp);  // New substring address
3045     } // non constant
3046 
3047     bind(CHECK_STR);
3048     cmpl(cnt1, stride);
3049     jccb(Assembler::aboveEqual, BIG_STRINGS);
3050 
3051     // Check cross page boundary.
3052     movl(result, str1); // We need only low 32 bits
3053     andl(result, ((int)os::vm_page_size()-1));
3054     cmpl(result, ((int)os::vm_page_size()-16));
3055     jccb(Assembler::belowEqual, BIG_STRINGS);
3056 
3057     subptr(rsp, 16);
3058     int stk_offset = -(1<<scale1);
3059     if (int_cnt2 < 0) { // not constant
3060       push(cnt2);
3061       stk_offset += wordSize;
3062     }
3063     movl(cnt2, cnt1);
3064 
3065     bind(COPY_STR);
3066     if (ae == StrIntrinsicNode::LL) {
3067       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3068       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3069     } else {
3070       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3071       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3072     }
3073     decrement(cnt2);
3074     jccb(Assembler::notZero, COPY_STR);
3075 
3076     if (int_cnt2 < 0) { // not constant
3077       pop(cnt2);
3078     }
3079     movptr(str1, rsp);  // New string address
3080 
3081     bind(BIG_STRINGS);
3082     // Load substring.
3083     if (int_cnt2 < 0) { // -1
3084       if (ae == StrIntrinsicNode::UL) {
3085         pmovzxbw(vec, Address(str2, 0));
3086       } else {
3087         movdqu(vec, Address(str2, 0));
3088       }
3089       push(cnt2);       // substr count
3090       push(str2);       // substr addr
3091       push(str1);       // string addr
3092     } else {
3093       // Small (< 8 chars) constant substrings are loaded already.
3094       movl(cnt2, int_cnt2);
3095     }
3096     push(tmp);  // original SP
3097 
3098   } // Finished loading
3099 
3100   //========================================================
3101   // Start search
3102   //
3103 
3104   movptr(result, str1); // string addr
3105 
3106   if (int_cnt2  < 0) {  // Only for non constant substring
3107     jmpb(SCAN_TO_SUBSTR);
3108 
3109     // SP saved at sp+0
3110     // String saved at sp+1*wordSize
3111     // Substr saved at sp+2*wordSize
3112     // Substr count saved at sp+3*wordSize
3113 
3114     // Reload substr for rescan, this code
3115     // is executed only for large substrings (> 8 chars)
3116     bind(RELOAD_SUBSTR);
3117     movptr(str2, Address(rsp, 2*wordSize));
3118     movl(cnt2, Address(rsp, 3*wordSize));
3119     if (ae == StrIntrinsicNode::UL) {
3120       pmovzxbw(vec, Address(str2, 0));
3121     } else {
3122       movdqu(vec, Address(str2, 0));
3123     }
3124     // We came here after the beginning of the substring was
3125     // matched but the rest of it was not so we need to search
3126     // again. Start from the next element after the previous match.
3127     subptr(str1, result); // Restore counter
3128     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3129       shrl(str1, 1);
3130     }
3131     addl(cnt1, str1);
3132     decrementl(cnt1);   // Shift to next element
3133     cmpl(cnt1, cnt2);
3134     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
3135 
3136     addptr(result, (1<<scale1));
3137   } // non constant
3138 
3139   // Scan string for start of substr in 16-byte vectors
3140   bind(SCAN_TO_SUBSTR);
3141   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3142   pcmpestri(vec, Address(result, 0), mode);
3143   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3144   subl(cnt1, stride);
3145   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3146   cmpl(cnt1, cnt2);
3147   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
3148   addptr(result, 16);
3149 
3150   bind(ADJUST_STR);
3151   cmpl(cnt1, stride); // Do not read beyond string
3152   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3153   // Back-up string to avoid reading beyond string.
3154   lea(result, Address(result, cnt1, scale1, -16));
3155   movl(cnt1, stride);
3156   jmpb(SCAN_TO_SUBSTR);
3157 
3158   // Found a potential substr
3159   bind(FOUND_CANDIDATE);
3160   // After pcmpestri tmp(rcx) contains matched element index
3161 
3162   // Make sure string is still long enough
3163   subl(cnt1, tmp);
3164   cmpl(cnt1, cnt2);
3165   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3166   // Left less then substring.
3167 
3168   bind(RET_NOT_FOUND);
3169   movl(result, -1);
3170   jmp(CLEANUP);
3171 
3172   bind(FOUND_SUBSTR);
3173   // Compute start addr of substr
3174   lea(result, Address(result, tmp, scale1));
3175   if (int_cnt2 > 0) { // Constant substring
3176     // Repeat search for small substring (< 8 chars)
3177     // from new point without reloading substring.
3178     // Have to check that we don't read beyond string.
3179     cmpl(tmp, stride-int_cnt2);
3180     jccb(Assembler::greater, ADJUST_STR);
3181     // Fall through if matched whole substring.
3182   } else { // non constant
3183     assert(int_cnt2 == -1, "should be != 0");
3184 
3185     addl(tmp, cnt2);
3186     // Found result if we matched whole substring.
3187     cmpl(tmp, stride);
3188     jcc(Assembler::lessEqual, RET_FOUND);
3189 
3190     // Repeat search for small substring (<= 8 chars)
3191     // from new point 'str1' without reloading substring.
3192     cmpl(cnt2, stride);
3193     // Have to check that we don't read beyond string.
3194     jccb(Assembler::lessEqual, ADJUST_STR);
3195 
3196     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3197     // Compare the rest of substring (> 8 chars).
3198     movptr(str1, result);
3199 
3200     cmpl(tmp, cnt2);
3201     // First 8 chars are already matched.
3202     jccb(Assembler::equal, CHECK_NEXT);
3203 
3204     bind(SCAN_SUBSTR);
3205     pcmpestri(vec, Address(str1, 0), mode);
3206     // Need to reload strings pointers if not matched whole vector
3207     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3208 
3209     bind(CHECK_NEXT);
3210     subl(cnt2, stride);
3211     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3212     addptr(str1, 16);
3213     if (ae == StrIntrinsicNode::UL) {
3214       addptr(str2, 8);
3215     } else {
3216       addptr(str2, 16);
3217     }
3218     subl(cnt1, stride);
3219     cmpl(cnt2, stride); // Do not read beyond substring
3220     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3221     // Back-up strings to avoid reading beyond substring.
3222 
3223     if (ae == StrIntrinsicNode::UL) {
3224       lea(str2, Address(str2, cnt2, scale2, -8));
3225       lea(str1, Address(str1, cnt2, scale1, -16));
3226     } else {
3227       lea(str2, Address(str2, cnt2, scale2, -16));
3228       lea(str1, Address(str1, cnt2, scale1, -16));
3229     }
3230     subl(cnt1, cnt2);
3231     movl(cnt2, stride);
3232     addl(cnt1, stride);
3233     bind(CONT_SCAN_SUBSTR);
3234     if (ae == StrIntrinsicNode::UL) {
3235       pmovzxbw(vec, Address(str2, 0));
3236     } else {
3237       movdqu(vec, Address(str2, 0));
3238     }
3239     jmp(SCAN_SUBSTR);
3240 
3241     bind(RET_FOUND_LONG);
3242     movptr(str1, Address(rsp, wordSize));
3243   } // non constant
3244 
3245   bind(RET_FOUND);
3246   // Compute substr offset
3247   subptr(result, str1);
3248   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3249     shrl(result, 1); // index
3250   }
3251   bind(CLEANUP);
3252   pop(rsp); // restore SP
3253 
3254 } // string_indexof
3255 
// Intrinsic for indexOf(char) on UTF-16 data: scan the char sequence at
// 'str1' ('cnt1' chars) for the 16-bit value in 'ch'. On exit 'result'
// holds the char index of the first match, or -1 if not found.
// Strategy: with AVX2, scan 16 chars (32 bytes) per iteration, then fall
// back to 8-char SSE vectors, then a scalar tail loop.
// Clobbers: cnt1, ch, tmp, vec1-vec3.
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 8;   // chars per 128-bit vector

  Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
        SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);   // result doubles as the running scan pointer
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR);        // < 8 chars: scalar loop only
    cmpl(cnt1, 2*stride);
    jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); // < 16 chars: 128-bit path
    // Broadcast the target char into all 16 word lanes of vec1;
    // vec2 stays zero and serves as the mask operand for vptest below.
    movdl(vec1, ch);
    vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
    andl(cnt1,0x0000000F);  //tail count (in chars)

    bind(SCAN_TO_16_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqw(vec3, vec3, vec1, 1);  // per-lane compare: 0xFFFF where equal
    // vec2 is zero, so vptest sets CF iff vec3 is all-zero; CF clear means
    // at least one lane matched.
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, 2*stride);
    jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
    jmp(SCAN_TO_8_CHAR);
    bind(SCAN_TO_8_CHAR_INIT);
    // 128-bit setup: replicate ch into all 8 word lanes of vec1.
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  bind(SCAN_TO_8_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR);  // fewer than 8 chars left: scalar tail
  if (UseAVX < 2) {
    // Non-AVX2 build: vec1/vec2 were not initialized above, do it here.
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
  andl(cnt1,0x00000007);  //tail count (in chars)

  bind(SCAN_TO_8_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqw(vec3, vec1);
  ptest(vec2, vec3);      // CF clear iff some lane matched (vec2 is zero)
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
  bind(SCAN_TO_CHAR);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  // Scalar tail: compare one char at a time.
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_short(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 2);      // advance one char (2 bytes)
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // Extract the per-byte compare mask and locate the lowest set bit:
  // that bit number is the byte offset of the match within the vector.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1);   // byte offset from string start...
  shrl(result, 1);        // ...converted to a char index

  bind(DONE_LABEL);
} // string_indexof_char
3348 
// Intrinsic for indexOf(char) on Latin-1 (byte) data: scan the byte
// sequence at 'str1' ('cnt1' bytes) for the 8-bit value in 'ch'. On exit
// 'result' holds the byte index of the first match, or -1 if not found.
// Strategy: with AVX2, scan 32 bytes per iteration, then fall back to
// 16-byte SSE vectors, then a scalar tail loop.
// Clobbers: cnt1, ch, tmp, vec1-vec3.
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 16;   // bytes per 128-bit vector

  Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
        SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);   // result doubles as the running scan pointer
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR_INIT);    // < 16 bytes: scalar loop only
    cmpl(cnt1, stride*2);
    jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); // < 32 bytes: 128-bit path
    // Broadcast the target byte into all 32 lanes of vec1; vec2 stays zero
    // and serves as the mask operand for vptest below.
    movdl(vec1, ch);
    vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
    andl(cnt1,0x0000001F);  //tail count (in chars)

    bind(SCAN_TO_32_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); // 0xFF where equal
    // vec2 is zero, so vptest sets CF iff vec3 is all-zero; CF clear means
    // at least one lane matched.
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, stride*2);
    jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
    jmp(SCAN_TO_16_CHAR);

    bind(SCAN_TO_16_CHAR_INIT);
    // 128-bit setup: pshufb with an all-zero control replicates the low
    // byte of vec1 into every lane.
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }

  bind(SCAN_TO_16_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
  if (UseAVX < 2) {
    // Non-AVX2 build: vec1/vec2 were not initialized above, do it here.
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
  andl(cnt1,0x0000000F);  //tail count (in bytes)

  bind(SCAN_TO_16_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqb(vec3, vec1);
  ptest(vec2, vec3);      // CF clear iff some lane matched (vec2 is zero)
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...

  bind(SCAN_TO_CHAR_INIT);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  // Scalar tail: compare one byte at a time.
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_byte(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 1);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // Extract the per-byte compare mask and locate the lowest set bit:
  // that bit number is the byte offset of the match within the vector.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1);   // byte index from string start (Latin-1: no shift)

  bind(DONE_LABEL);
} // stringL_indexof_char
3441 
3442 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3443   switch (eltype) {
3444   case T_BOOLEAN: return sizeof(jboolean);
3445   case T_BYTE:  return sizeof(jbyte);
3446   case T_SHORT: return sizeof(jshort);
3447   case T_CHAR:  return sizeof(jchar);
3448   case T_INT:   return sizeof(jint);
3449   default:
3450     ShouldNotReachHere();
3451     return -1;
3452   }
3453 }
3454 
3455 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3456   switch (eltype) {
3457   // T_BOOLEAN used as surrogate for unsigned byte
3458   case T_BOOLEAN: movzbl(dst, src);   break;
3459   case T_BYTE:    movsbl(dst, src);   break;
3460   case T_SHORT:   movswl(dst, src);   break;
3461   case T_CHAR:    movzwl(dst, src);   break;
3462   case T_INT:     movl(dst, src);     break;
3463   default:
3464     ShouldNotReachHere();
3465   }
3466 }
3467 
3468 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3469   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3470 }
3471 
3472 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3473   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3474 }
3475 
3476 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3477   const int vlen = Assembler::AVX_256bit;
3478   switch (eltype) {
3479   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3480   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3481   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3482   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3483   case T_INT:
3484     // do nothing
3485     break;
3486   default:
3487     ShouldNotReachHere();
3488   }
3489 }
3490 
// Intrinsic for the Java polynomial array hash (h = h*31 + element):
// accumulates the hash of 'cnt1' elements of 'ary1' into 'result', which
// is expected to hold the initial hash value on entry. Arrays of at least
// 32 elements go through a 4-way, 8-lane AVX2 vector loop; the remainder
// (and short arrays) are handled by a 2-elements-per-iteration scalar
// loop plus a single-element epilogue.
// Clobbers: ary1, cnt1, index, tmp2, tmp3 and all the XMM arguments.
void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
                                        Register index, Register tmp2, Register tmp3, XMMRegister vnext,
                                        XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
                                        XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
                                        XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
                                        BasicType eltype) {
  ShortBranchVerifier sbv(this);
  assert(UseAVX >= 2, "AVX2 intrinsics are required");
  assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
  assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);

  Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
        SHORT_UNROLLED_LOOP_EXIT,
        UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
        UNROLLED_VECTOR_LOOP_BEGIN,
        END;
  switch (eltype) {
  case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
  case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
  case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
  case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
  case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
  default:        BLOCK_COMMENT("arrays_hashcode {");                break;
  }

  // For "renaming" for readibility of the code
  const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
                    vresult[] = { vresult0, vresult1, vresult2, vresult3 },
                    vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };

  const int elsize = arrays_hashcode_elsize(eltype);

  /*
    if (cnt1 >= 2) {
      if (cnt1 >= 32) {
        UNROLLED VECTOR LOOP
      }
      UNROLLED SCALAR LOOP
    }
    SINGLE SCALAR
   */

  cmpl(cnt1, 32);
  jcc(Assembler::less, SHORT_UNROLLED_BEGIN);

  // cnt1 >= 32 && generate_vectorized_loop
  xorl(index, index);

  // vresult = IntVector.zero(I256);
  for (int idx = 0; idx < 4; idx++) {
    vpxor(vresult[idx], vresult[idx]);
  }
  // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
  Register bound = tmp2;
  Register next = tmp3;
  // 'next' is the first entry of the powers-of-31 table; multiplying the
  // running hash by it each iteration accounts for the 32 elements folded
  // in per pass.
  lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
  movl(next, Address(tmp2, 0));
  movdl(vnext, next);
  vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);

  // index = 0;
  // bound = cnt1 & ~(32 - 1);
  movl(bound, cnt1);
  andl(bound, ~(32 - 1));
  // for (; index < bound; index += 32) {
  bind(UNROLLED_VECTOR_LOOP_BEGIN);
  // result *= next;
  imull(result, next);
  // loop fission to upfront the cost of fetching from memory, OOO execution
  // can then hopefully do a better job of prefetching
  for (int idx = 0; idx < 4; idx++) {
    arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
  }
  // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
  for (int idx = 0; idx < 4; idx++) {
    vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
    arrays_hashcode_elvcast(vtmp[idx], eltype);
    vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
  }
  // index += 32;
  addl(index, 32);
  // index < bound;
  cmpl(index, bound);
  jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
  // }

  // Advance the array pointer past the vector-processed prefix and leave
  // only the remainder count in cnt1 for the scalar loops below.
  lea(ary1, Address(ary1, bound, Address::times(elsize)));
  subl(cnt1, bound);
  // release bound

  // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
  for (int idx = 0; idx < 4; idx++) {
    lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
    arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
    vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
  }
  // result += vresult.reduceLanes(ADD);
  for (int idx = 0; idx < 4; idx++) {
    reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
  }

  // } else if (cnt1 < 32) {

  bind(SHORT_UNROLLED_BEGIN);
  // int i = 1;
  movl(index, 1);
  cmpl(index, cnt1);
  jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);

  // for (; i < cnt1 ; i += 2) {
  // Two elements per iteration:
  //   result = result*31*31 + a[i-1]*31 + a[i]
  bind(SHORT_UNROLLED_LOOP_BEGIN);
  movl(tmp3, 961);              // 961 == 31*31
  imull(result, tmp3);
  arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  movl(tmp3, tmp2);
  shll(tmp3, 5);                // tmp3 = tmp2*32 ...
  subl(tmp3, tmp2);             // ... - tmp2 == tmp2*31
  addl(result, tmp3);
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
  addl(result, tmp3);
  addl(index, 2);
  cmpl(index, cnt1);
  jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);

  // }
  // if (i >= cnt1) {
  bind(SHORT_UNROLLED_LOOP_EXIT);
  // Branch condition relies on the flags still set by the most recent
  // cmpl(index, cnt1) above: index > cnt1 means no element is left over.
  jccb(Assembler::greater, END);
  // Fold in the single remaining element: result = result*31 + a[i-1].
  movl(tmp2, result);
  shll(result, 5);
  subl(result, tmp2);
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  addl(result, tmp3);
  // }
  bind(END);

  BLOCK_COMMENT("} // arrays_hashcode");

} // arrays_hashcode
3630 
3631 // helper function for string_compare
3632 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3633                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3634                                            Address::ScaleFactor scale2, Register index, int ae) {
3635   if (ae == StrIntrinsicNode::LL) {
3636     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3637     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3638   } else if (ae == StrIntrinsicNode::UU) {
3639     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3640     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3641   } else {
3642     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3643     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3644   }
3645 }
3646 
3647 // Compare strings, used for char[] and byte[].
3648 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3649                                        Register cnt1, Register cnt2, Register result,
3650                                        XMMRegister vec1, int ae, KRegister mask) {
3651   ShortBranchVerifier sbv(this);
3652   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3653   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only AVX3
3654   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3655   int stride2x2 = 0x40;
3656   Address::ScaleFactor scale = Address::no_scale;
3657   Address::ScaleFactor scale1 = Address::no_scale;
3658   Address::ScaleFactor scale2 = Address::no_scale;
3659 
3660   if (ae != StrIntrinsicNode::LL) {
3661     stride2x2 = 0x20;
3662   }
3663 
3664   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3665     shrl(cnt2, 1);
3666   }
3667   // Compute the minimum of the string lengths and the
3668   // difference of the string lengths (stack).
3669   // Do the conditional move stuff
3670   movl(result, cnt1);
3671   subl(cnt1, cnt2);
3672   push(cnt1);
3673   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3674 
3675   // Is the minimum length zero?
3676   testl(cnt2, cnt2);
3677   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3678   if (ae == StrIntrinsicNode::LL) {
3679     // Load first bytes
3680     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3681     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3682   } else if (ae == StrIntrinsicNode::UU) {
3683     // Load first characters
3684     load_unsigned_short(result, Address(str1, 0));
3685     load_unsigned_short(cnt1, Address(str2, 0));
3686   } else {
3687     load_unsigned_byte(result, Address(str1, 0));
3688     load_unsigned_short(cnt1, Address(str2, 0));
3689   }
3690   subl(result, cnt1);
3691   jcc(Assembler::notZero,  POP_LABEL);
3692 
3693   if (ae == StrIntrinsicNode::UU) {
3694     // Divide length by 2 to get number of chars
3695     shrl(cnt2, 1);
3696   }
3697   cmpl(cnt2, 1);
3698   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3699 
3700   // Check if the strings start at the same location and setup scale and stride
3701   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3702     cmpptr(str1, str2);
3703     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3704     if (ae == StrIntrinsicNode::LL) {
3705       scale = Address::times_1;
3706       stride = 16;
3707     } else {
3708       scale = Address::times_2;
3709       stride = 8;
3710     }
3711   } else {
3712     scale1 = Address::times_1;
3713     scale2 = Address::times_2;
3714     // scale not used
3715     stride = 8;
3716   }
3717 
3718   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3719     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3720     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3721     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3722     Label COMPARE_TAIL_LONG;
3723     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only AVX3
3724 
3725     int pcmpmask = 0x19;
3726     if (ae == StrIntrinsicNode::LL) {
3727       pcmpmask &= ~0x01;
3728     }
3729 
3730     // Setup to compare 16-chars (32-bytes) vectors,
3731     // start from first character again because it has aligned address.
3732     if (ae == StrIntrinsicNode::LL) {
3733       stride2 = 32;
3734     } else {
3735       stride2 = 16;
3736     }
3737     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3738       adr_stride = stride << scale;
3739     } else {
3740       adr_stride1 = 8;  //stride << scale1;
3741       adr_stride2 = 16; //stride << scale2;
3742     }
3743 
3744     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3745     // rax and rdx are used by pcmpestri as elements counters
3746     movl(result, cnt2);
3747     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3748     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3749 
3750     // fast path : compare first 2 8-char vectors.
3751     bind(COMPARE_16_CHARS);
3752     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3753       movdqu(vec1, Address(str1, 0));
3754     } else {
3755       pmovzxbw(vec1, Address(str1, 0));
3756     }
3757     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3758     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3759 
3760     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3761       movdqu(vec1, Address(str1, adr_stride));
3762       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3763     } else {
3764       pmovzxbw(vec1, Address(str1, adr_stride1));
3765       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3766     }
3767     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3768     addl(cnt1, stride);
3769 
3770     // Compare the characters at index in cnt1
3771     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3772     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3773     subl(result, cnt2);
3774     jmp(POP_LABEL);
3775 
3776     // Setup the registers to start vector comparison loop
3777     bind(COMPARE_WIDE_VECTORS);
3778     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3779       lea(str1, Address(str1, result, scale));
3780       lea(str2, Address(str2, result, scale));
3781     } else {
3782       lea(str1, Address(str1, result, scale1));
3783       lea(str2, Address(str2, result, scale2));
3784     }
3785     subl(result, stride2);
3786     subl(cnt2, stride2);
3787     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3788     negptr(result);
3789 
3790     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3791     bind(COMPARE_WIDE_VECTORS_LOOP);
3792 
3793     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3794       cmpl(cnt2, stride2x2);
3795       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3796       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3797       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3798 
3799       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3800       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3801         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3802         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3803       } else {
3804         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3805         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3806       }
3807       kortestql(mask, mask);
3808       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3809       addptr(result, stride2x2);  // update since we already compared at this addr
3810       subl(cnt2, stride2x2);      // and sub the size too
3811       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3812 
3813       vpxor(vec1, vec1);
3814       jmpb(COMPARE_WIDE_TAIL);
3815     }//if (VM_Version::supports_avx512vlbw())
3816 
3817     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3818     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3819       vmovdqu(vec1, Address(str1, result, scale));
3820       vpxor(vec1, Address(str2, result, scale));
3821     } else {
3822       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3823       vpxor(vec1, Address(str2, result, scale2));
3824     }
3825     vptest(vec1, vec1);
3826     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3827     addptr(result, stride2);
3828     subl(cnt2, stride2);
3829     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3830     // clean upper bits of YMM registers
3831     vpxor(vec1, vec1);
3832 
3833     // compare wide vectors tail
3834     bind(COMPARE_WIDE_TAIL);
3835     testptr(result, result);
3836     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3837 
3838     movl(result, stride2);
3839     movl(cnt2, result);
3840     negptr(result);
3841     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3842 
3843     // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
3844     bind(VECTOR_NOT_EQUAL);
3845     // clean upper bits of YMM registers
3846     vpxor(vec1, vec1);
3847     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3848       lea(str1, Address(str1, result, scale));
3849       lea(str2, Address(str2, result, scale));
3850     } else {
3851       lea(str1, Address(str1, result, scale1));
3852       lea(str2, Address(str2, result, scale2));
3853     }
3854     jmp(COMPARE_16_CHARS);
3855 
3856     // Compare tail chars, length between 1 to 15 chars
3857     bind(COMPARE_TAIL_LONG);
3858     movl(cnt2, result);
3859     cmpl(cnt2, stride);
3860     jcc(Assembler::less, COMPARE_SMALL_STR);
3861 
3862     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3863       movdqu(vec1, Address(str1, 0));
3864     } else {
3865       pmovzxbw(vec1, Address(str1, 0));
3866     }
3867     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3868     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3869     subptr(cnt2, stride);
3870     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3871     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3872       lea(str1, Address(str1, result, scale));
3873       lea(str2, Address(str2, result, scale));
3874     } else {
3875       lea(str1, Address(str1, result, scale1));
3876       lea(str2, Address(str2, result, scale2));
3877     }
3878     negptr(cnt2);
3879     jmpb(WHILE_HEAD_LABEL);
3880 
3881     bind(COMPARE_SMALL_STR);
3882   } else if (UseSSE42Intrinsics) {
3883     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3884     int pcmpmask = 0x19;
3885     // Setup to compare 8-char (16-byte) vectors,
3886     // start from first character again because it has aligned address.
3887     movl(result, cnt2);
3888     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3889     if (ae == StrIntrinsicNode::LL) {
3890       pcmpmask &= ~0x01;
3891     }
3892     jcc(Assembler::zero, COMPARE_TAIL);
3893     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3894       lea(str1, Address(str1, result, scale));
3895       lea(str2, Address(str2, result, scale));
3896     } else {
3897       lea(str1, Address(str1, result, scale1));
3898       lea(str2, Address(str2, result, scale2));
3899     }
3900     negptr(result);
3901 
3902     // pcmpestri
3903     //   inputs:
3904     //     vec1- substring
3905     //     rax - negative string length (elements count)
3906     //     mem - scanned string
3907     //     rdx - string length (elements count)
3908     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3909     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3910     //   outputs:
3911     //     rcx - first mismatched element index
3912     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3913 
3914     bind(COMPARE_WIDE_VECTORS);
3915     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3916       movdqu(vec1, Address(str1, result, scale));
3917       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3918     } else {
3919       pmovzxbw(vec1, Address(str1, result, scale1));
3920       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3921     }
3922     // After pcmpestri cnt1(rcx) contains mismatched element index
3923 
3924     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3925     addptr(result, stride);
3926     subptr(cnt2, stride);
3927     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3928 
3929     // compare wide vectors tail
3930     testptr(result, result);
3931     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3932 
3933     movl(cnt2, stride);
3934     movl(result, stride);
3935     negptr(result);
3936     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3937       movdqu(vec1, Address(str1, result, scale));
3938       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3939     } else {
3940       pmovzxbw(vec1, Address(str1, result, scale1));
3941       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3942     }
3943     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3944 
3945     // Mismatched characters in the vectors
3946     bind(VECTOR_NOT_EQUAL);
3947     addptr(cnt1, result);
3948     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3949     subl(result, cnt2);
3950     jmpb(POP_LABEL);
3951 
3952     bind(COMPARE_TAIL); // limit is zero
3953     movl(cnt2, result);
3954     // Fallthru to tail compare
3955   }
3956   // Shift str2 and str1 to the end of the arrays, negate min
3957   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3958     lea(str1, Address(str1, cnt2, scale));
3959     lea(str2, Address(str2, cnt2, scale));
3960   } else {
3961     lea(str1, Address(str1, cnt2, scale1));
3962     lea(str2, Address(str2, cnt2, scale2));
3963   }
3964   decrementl(cnt2);  // first character was compared already
3965   negptr(cnt2);
3966 
3967   // Compare the rest of the elements
3968   bind(WHILE_HEAD_LABEL);
3969   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3970   subl(result, cnt1);
3971   jccb(Assembler::notZero, POP_LABEL);
3972   increment(cnt2);
3973   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3974 
3975   // Strings are equal up to min length.  Return the length difference.
3976   bind(LENGTH_DIFF_LABEL);
3977   pop(result);
3978   if (ae == StrIntrinsicNode::UU) {
3979     // Divide diff by 2 to get number of chars
3980     sarl(result, 1);
3981   }
3982   jmpb(DONE_LABEL);
3983 
3984   if (VM_Version::supports_avx512vlbw()) {
3985 
3986     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3987 
3988     kmovql(cnt1, mask);
3989     notq(cnt1);
3990     bsfq(cnt2, cnt1);
3991     if (ae != StrIntrinsicNode::LL) {
3992       // Divide diff by 2 to get number of chars
3993       sarl(cnt2, 1);
3994     }
3995     addq(result, cnt2);
3996     if (ae == StrIntrinsicNode::LL) {
3997       load_unsigned_byte(cnt1, Address(str2, result));
3998       load_unsigned_byte(result, Address(str1, result));
3999     } else if (ae == StrIntrinsicNode::UU) {
4000       load_unsigned_short(cnt1, Address(str2, result, scale));
4001       load_unsigned_short(result, Address(str1, result, scale));
4002     } else {
4003       load_unsigned_short(cnt1, Address(str2, result, scale2));
4004       load_unsigned_byte(result, Address(str1, result, scale1));
4005     }
4006     subl(result, cnt1);
4007     jmpb(POP_LABEL);
4008   }//if (VM_Version::supports_avx512vlbw())
4009 
4010   // Discard the stored length difference
4011   bind(POP_LABEL);
4012   pop(cnt1);
4013 
4014   // That's it
4015   bind(DONE_LABEL);
4016   if(ae == StrIntrinsicNode::UL) {
4017     negl(result);
4018   }
4019 
4020 }
4021 
4022 // Search for Non-ASCII character (Negative byte value) in a byte array,
4023 // return the index of the first such character, otherwise the length
4024 // of the array segment searched.
4025 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4026 //   @IntrinsicCandidate
4027 //   public static int countPositives(byte[] ba, int off, int len) {
4028 //     for (int i = off; i < off + len; i++) {
4029 //       if (ba[i] < 0) {
4030 //         return i - off;
4031 //       }
4032 //     }
4033 //     return len;
4034 //   }
void C2_MacroAssembler::count_positives(Register ary1, Register len,
  Register result, Register tmp1,
  XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
  // Emits code that scans ary1[0..len-1] and leaves in result the index of
  // the first negative byte, or len if every byte is non-negative (see the
  // countPositives() pseudo-code above). Wide vector probes only locate the
  // block containing a negative byte; the scalar tail code pins down the
  // exact index.
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  // NOTE(review): ADJUST is declared but never bound or referenced below —
  // it looks like a leftover; confirm it can be removed.
  Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  movl(result, len); // copy; result stays == len on the all-positive path
  // len == 0
  testl(len, len);
  jcc(Assembler::zero, DONE);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
    VM_Version::supports_avx512vlbw() &&
    VM_Version::supports_bmi2()) {

    Label test_64_loop, test_tail, BREAK_LOOP;
    movl(tmp1, len);
    // vec2 = all zeroes; "0 > byte" below is exactly "byte is negative"
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);

    andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
    andl(len,  0xffffffc0); // vector count (in chars)
    jccb(Assembler::zero, test_tail);

    // Point ary1 past the vector region and run len from -count up to 0
    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len);

    bind(test_64_loop);
    // Check whether our 64 elements of size byte contain negatives
    evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(mask1, mask1);
    jcc(Assembler::notZero, BREAK_LOOP);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done
    testl(tmp1, -1);   // ZF iff tail count is zero
    jcc(Assembler::zero, DONE);


    // check the tail for absence of negatives
    // ~(~0 << len) applied up to two times (for 32-bit scenario)
    {
      Register tmp3_aliased = len;
      mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
      shlxq(tmp3_aliased, tmp3_aliased, tmp1);
      notq(tmp3_aliased);
      kmovql(mask2, tmp3_aliased);   // mask2 selects only the tmp1 valid tail bytes
    }

    evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(mask1, mask2);
    jcc(Assembler::zero, DONE);      // no negatives in tail: result == len

    // do a full check for negative registers in the tail
    movl(len, tmp1); // tmp1 holds low 6-bit from original len;
                     // ary1 already pointing to the right place
    jmpb(TAIL_START);

    bind(BREAK_LOOP);
    // At least one byte in the last 64 byte block was negative.
    // Set up to look at the last 64 bytes as if they were a tail
    lea(ary1, Address(ary1, len, Address::times_1));
    addptr(result, len);
    // Ignore the very last byte: if all others are positive,
    // it must be negative, so we can skip right to the 2+1 byte
    // end comparison at this point
    orl(result, 63);
    movl(len, 63);
    // Fallthru to tail compare
  } else {

    if (UseAVX >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 32-byte vectors
      testl(len, 0xffffffe0);   // vector count (in bytes)
      jccb(Assembler::zero, TAIL_START);

      andl(len, 0xffffffe0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      // Sign-bit mask: vptest against it flags any byte with bit 7 set
      movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 32);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000001f);   // any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000001f);
      vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
      vptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      // There are zeros, jump to the tail to determine exactly where
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 32-byte vector is negative.
      // Set up to look at the last 32 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 31);
      movl(len, 31);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 16-byte vectors
      testl(len, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, TAIL_START);

      andl(len, 0xfffffff0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      // Same sign-bit mask as the AVX2 path, replicated across 16 bytes
      movl(tmp1, 0x80808080);
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 16);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000000f); // len is zero, any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000000f);   // tail count (in bytes)
      movdqu(vec1, Address(ary1, len, Address::times_1, -16));
      ptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 16-byte vector is negative.
      // Set up and look at the last 16 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 15);
      movl(len, 15);
      // Fallthru to tail compare
    }
  }

  // Scalar tail: at most 63 bytes, 4 at a time, then 2 + 1.
  bind(TAIL_START);
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  andl(tmp1, 0x80808080);   // any sign bit set in these 4 bytes?
  jccb(Assembler::notZero, TAIL_ADJUST);
  addptr(len, 4);
  jccb(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2-3 bytes), if any
  bind(COMPARE_CHAR);

  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080);
  jccb(Assembler::notZero, CHAR_ADJUST);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1);   // tail  byte
  jccb(Assembler::zero, DONE);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  testl(tmp1, 0x00000080);
  jccb(Assembler::zero, DONE);
  subptr(result, 1);    // last byte is negative: its index is result - 1
  jmpb(DONE);

  bind(TAIL_ADJUST);
  // there are negative bits in the last 4 byte block.
  // Adjust result and check the next three bytes
  addptr(result, len);
  orl(result, 3);
  lea(ary1, Address(ary1, len, Address::times_1));
  jmpb(COMPARE_CHAR);

  bind(CHAR_ADJUST);
  // We are looking at a char + optional byte tail, and found that one
  // of the bytes in the char is negative. Adjust the result, check the
  // first byte and readjust if needed.
  andl(result, 0xfffffffc);
  testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
  jccb(Assembler::notZero, DONE);
  addptr(result, 1);       // first byte positive, so the second one is negative
  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
4266 
4267 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                      Register limit, Register result, Register chr,
                                      XMMRegister vec1, XMMRegister vec2, bool is_char,
                                      KRegister mask, bool expand_ary2) {
  // Emits code that sets result to 1 when the two arrays/substrings hold
  // identical contents, 0 otherwise.
  //   is_array_equ - compare whole array objects: perform the reference,
  //                  null and length checks and skip the array headers;
  //                  otherwise ary1/ary2 already point at the data.
  //   is_char      - elements are 2-byte chars (limit is scaled to bytes
  //                  below when comparing whole char arrays).
  //   expand_ary2  - ary2 holds bytes that are zero-extended to shorts
  //                  before comparing against ary1 (AVX2-only mode).
  // for expand_ary2, limit is the (smaller) size of the second array.
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;

  assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
         "Expansion only implemented for AVX2");

  int length_offset  = arrayOopDesc::length_offset_in_bytes();
  int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);

  // In expand mode ary1 advances two bytes for every one byte of ary2, so
  // ary1 uses a times_2 scale and each vector step covers 8 ary2 bytes.
  Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
  int scaleIncr = expand_ary2 ? 8 : 16;

  if (is_array_equ) {
    // Check the input args
    cmpoop(ary1, ary2);   // same reference => trivially equal
    jcc(Assembler::equal, TRUE_LABEL);

    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }

  if (is_array_equ && is_char) {
    // arrays_equals when used for char[].
    shll(limit, 1);      // byte count != 0
  }
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;

    // Compare 32-byte vectors
    if (expand_ary2) {
      andl(result, 0x0000000f);  //   tail count (in bytes)
      andl(limit, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL);
    } else {
      andl(result, 0x0000001f);  //   tail count (in bytes)
      andl(limit, 0xffffffe0);   // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL_16);
    }

    // Point past the vector region and run limit from -count up to 0
    lea(ary1, Address(ary1, limit, scaleFactor));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;

      cmpl(limit, -64);
      jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop

      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
      // kortest sets CF only when the mask is all ones; aboveEqual (CF==0)
      // therefore means at least one of the 64 bytes differed.
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
      addptr(limit, 64);  // update since we already compared at this addr
      cmpl(limit, -64);
      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instruction and just continue via non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL);  // true
      // But since we stopped at the points ary{1,2}+limit which are
      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
      //
      addptr(result, -64);   // it is safe, bc we just came from this area
      evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare

      jmp(TRUE_LABEL);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    }//if (VM_Version::supports_avx512vlbw())

    bind(COMPARE_WIDE_VECTORS);
    vmovdqu(vec1, Address(ary1, limit, scaleFactor));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
    } else {
      vmovdqu(vec2, Address(ary2, limit, Address::times_1));
    }
    vpxor(vec1, vec2);   // equal iff the XOR is all-zero

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, scaleIncr * 2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    // Overlapping compare of the last full vector covers the tail bytes
    vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
    } else {
      vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
    }
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    jmp(TRUE_LABEL);

    bind(COMPARE_TAIL_16); // limit is zero
    movl(limit, result);

    // Compare 16-byte chunks
    andl(result, 0x0000000f);  //   tail count (in bytes)
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, scaleFactor));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS_16);
    movdqu(vec1, Address(ary1, limit, scaleFactor));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
    } else {
      movdqu(vec2, Address(ary2, limit, Address::times_1));
    }
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, scaleIncr);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  } else if (UseSSE42Intrinsics) {
    // With SSE4.2, use double quad vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 16-byte vectors
    andl(result, 0x0000000f);  //   tail count (in bytes)
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(ary1, limit, Address::times_1));
    movdqu(vec2, Address(ary2, limit, Address::times_1));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 16);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    // Overlapping compare of the last 16 bytes covers the tail
    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  }

  // Compare 4-byte vectors
  if (expand_ary2) {
    testl(result, result);
    jccb(Assembler::zero, TRUE_LABEL);
  } else {
    andl(limit, 0xfffffffc); // vector count (in bytes)
    jccb(Assembler::zero, COMPARE_CHAR);
  }

  lea(ary1, Address(ary1, limit, scaleFactor));
  lea(ary2, Address(ary2, limit, Address::times_1));
  negptr(limit);

  bind(COMPARE_VECTORS);
  if (expand_ary2) {
    // There are no "vector" operations for bytes to shorts
    movzbl(chr, Address(ary2, limit, Address::times_1));
    cmpw(Address(ary1, limit, Address::times_2), chr);
    jccb(Assembler::notEqual, FALSE_LABEL);
    addptr(limit, 1);
    jcc(Assembler::notZero, COMPARE_VECTORS);
    jmp(TRUE_LABEL);
  } else {
    movl(chr, Address(ary1, limit, Address::times_1));
    cmpl(chr, Address(ary2, limit, Address::times_1));
    jccb(Assembler::notEqual, FALSE_LABEL);
    addptr(limit, 4);
    jcc(Assembler::notZero, COMPARE_VECTORS);
  }

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(chr, Address(ary1, 0));
  load_unsigned_short(limit, Address(ary2, 0));
  cmpl(chr, limit);
  jccb(Assembler::notEqual, FALSE_LABEL);

  if (is_array_equ && is_char) {
    // char arrays have an even byte count, so there can be no odd tail byte
    bind(COMPARE_BYTE);
  } else {
    lea(ary1, Address(ary1, 2));
    lea(ary2, Address(ary2, 2));

    bind(COMPARE_BYTE);
    testl(result, 0x1);   // tail  byte
    jccb(Assembler::zero, TRUE_LABEL);
    load_unsigned_byte(chr, Address(ary1, 0));
    load_unsigned_byte(limit, Address(ary2, 0));
    cmpl(chr, limit);
    jccb(Assembler::notEqual, FALSE_LABEL);
  }
  bind(TRUE_LABEL);
  movl(result, 1);   // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
4539 
// Out-of-line slow path for convertF2I: spills the low 64 bits of the source
// XMM register to the stack, calls the fixup routine stored in the stub data,
// then pops the corrected result into dst and resumes after the stub.
// NOTE(review): this relies on the fixup stub consuming the spilled value and
// leaving the final result in the same stack slot — confirm against the
// f2i/f2l/d2i/d2l fixup stub generators.
static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
#define __ masm.
  Register dst = stub.data<0>();
  XMMRegister src = stub.data<1>();
  address target = stub.data<2>();
  __ bind(stub.entry());
  __ subptr(rsp, 8);
  __ movdbl(Address(rsp), src);
  __ call(RuntimeAddress(target));
  // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
  __ pop(dst);
  __ jmp(stub.continuation());
#undef __
}
4554 
void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
  // Emits a truncating float/double -> int/long conversion. The cvtt*
  // instructions produce the "integer indefinite" value (0x80000000 for
  // 32-bit, 0x8000000000000000 for 64-bit results) on NaN or overflow;
  // when dst equals that sentinel we take the out-of-line stub, which
  // calls the matching fixup routine to compute the Java-specified result.
  assert(dst_bt == T_INT || dst_bt == T_LONG, "");
  assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");

  address slowpath_target;
  if (dst_bt == T_INT) {
    if (src_bt == T_FLOAT) {
      cvttss2sil(dst, src);
      cmpl(dst, 0x80000000);   // 32-bit sentinel
      slowpath_target = StubRoutines::x86::f2i_fixup();
    } else {
      cvttsd2sil(dst, src);
      cmpl(dst, 0x80000000);
      slowpath_target = StubRoutines::x86::d2i_fixup();
    }
  } else {
    if (src_bt == T_FLOAT) {
      cvttss2siq(dst, src);
      // double_sign_flip holds the 64-bit sentinel bit pattern
      cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
      slowpath_target = StubRoutines::x86::f2l_fixup();
    } else {
      cvttsd2siq(dst, src);
      cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
      slowpath_target = StubRoutines::x86::d2l_fixup();
    }
  }

  // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
  int max_size = 23 + (UseAPX ? 1 : 0);
  auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
  jcc(Assembler::equal, stub->entry());   // dst == sentinel => needs fixup
  bind(stub->continuation());
}
4588 
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, int imm8, bool merge, int vlen_enc) {
  // Emit an AVX-512 masked vector shift/rotate of src1 by the immediate
  // count imm8 into dst, selected by the ideal opcode. eType is only
  // consulted by the rotate helpers; merge chooses merge-masking over
  // zero-masking; vlen_enc encodes the vector length. Unknown opcodes are
  // fatal.
  switch(ideal_opc) {
    case Op_LShiftVS:
      Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_LShiftVI:
      Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_LShiftVL:
      Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVS:
      Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVI:
      Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVL:
      Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVS:
      Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVI:
      Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVL:
      Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RotateRightV:
      evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RotateLeftV:
      evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4619 
4620 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4621                                                XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4622   if (is_unsigned) {
4623     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4624   } else {
4625     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4626   }
4627 }
4628 
void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
                                                      XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
  // Emit a masked signed saturating add/sub (evpadds*/evpsubs*) of src1 and
  // src2 into dst. Saturating lane types are byte and short only; anything
  // else is fatal. ideal_opc must be Op_SaturatingAddV or Op_SaturatingSubV.
  switch (elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}
4653 
void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
                                                        XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
  // Emit a masked unsigned saturating add/sub (evpaddus*/evpsubus*) of src1
  // and src2 into dst. Saturating lane types are byte and short only;
  // anything else is fatal. ideal_opc must be Op_SaturatingAddV or
  // Op_SaturatingSubV.
  switch (elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}
4678 
4679 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4680                                                Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4681   if (is_unsigned) {
4682     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4683   } else {
4684     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4685   }
4686 }
4687 
void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
                                                      XMMRegister src1, Address src2, bool merge, int vlen_enc) {
  // Same as the register-register variant above, but with the second source
  // operand in memory. Saturating lane types are byte and short only;
  // anything else is fatal. ideal_opc must be Op_SaturatingAddV or
  // Op_SaturatingSubV.
  switch (elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}
4712 
void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
                                                        XMMRegister src1, Address src2, bool merge, int vlen_enc) {
  // Same as the register-register unsigned variant above, but with the
  // second source operand in memory. Saturating lane types are byte and
  // short only; anything else is fatal. ideal_opc must be Op_SaturatingAddV
  // or Op_SaturatingSubV.
  switch (elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}
4737 
// Emit the EVEX write-masked instruction corresponding to the given C2 ideal
// opcode, register-register operand form. Only lanes selected by 'mask' are
// written; 'merge' is forwarded to the emitters (presumably selecting merge-
// vs. zero-masking of unselected lanes -- semantics live in the ev* helpers).
// 'is_varshift' selects the per-lane variable-shift encodings for the shift
// opcodes. Unknown opcodes are fatal.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
                                    bool is_varshift) {
  switch (ideal_opc) {
    // Lane-wise add (integer and floating point).
    case Op_AddVB:
      evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVS:
      evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVI:
      evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVL:
      evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVF:
      evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVD:
      evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Lane-wise subtract.
    case Op_SubVB:
      evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVS:
      evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVI:
      evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVL:
      evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVF:
      evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVD:
      evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Lane-wise multiply.
    case Op_MulVS:
      evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVI:
      evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVL:
      evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVF:
      evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVD:
      evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Floating point divide and square root.
    case Op_DivVF:
      evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVD:
      evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SqrtVF:
      evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SqrtVD:
      evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Unary abs takes its input in src2; src1 is unused for these.
    case Op_AbsVB:
      evpabsb(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVS:
      evpabsw(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVI:
      evpabsd(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVL:
      evpabsq(dst, mask, src2, merge, vlen_enc); break;
    // Fused multiply-add (213 operand-order encoding).
    case Op_FmaVF:
      evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVD:
      evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Note: operand order is swapped (src2 before src1) for evperm.
    case Op_VectorRearrange:
      evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
    // Shifts: is_varshift selects the per-lane variable-shift forms.
    case Op_LShiftVS:
      evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_LShiftVI:
      evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_LShiftVL:
      evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVS:
      evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVI:
      evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVL:
      evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVS:
      evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVI:
      evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVL:
      evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    // Rotates, min/max and logical ops dispatch on eType inside the helpers.
    case Op_RotateLeftV:
      evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_RotateRightV:
      evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MaxV:
      evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MinV:
      evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMinV:
      evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMaxV:
      evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_XorV:
      evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_OrV:
      evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AndV:
      evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4839 
// Emit the EVEX write-masked instruction for the given C2 ideal opcode with a
// memory second operand. Supports a subset of the register-register form's
// opcodes (no shifts, rotates, abs, sqrt or rearrange). Unknown opcodes are
// fatal.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, Address src2, bool merge, int vlen_enc) {
  switch (ideal_opc) {
    // Lane-wise add (integer and floating point).
    case Op_AddVB:
      evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVS:
      evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVI:
      evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVL:
      evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVF:
      evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVD:
      evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Lane-wise subtract.
    case Op_SubVB:
      evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVS:
      evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVI:
      evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVL:
      evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVF:
      evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVD:
      evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Lane-wise multiply.
    case Op_MulVS:
      evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVI:
      evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVL:
      evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVF:
      evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVD:
      evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Floating point divide and fused multiply-add (213 encoding).
    case Op_DivVF:
      evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVD:
      evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVF:
      evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVD:
      evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Min/max and logical ops dispatch on eType inside the helpers.
    case Op_MaxV:
      evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MinV:
      evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMaxV:
      evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMinV:
      evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_XorV:
      evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_OrV:
      evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AndV:
      evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4904 
4905 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4906                                   KRegister src1, KRegister src2) {
4907   BasicType etype = T_ILLEGAL;
4908   switch(mask_len) {
4909     case 2:
4910     case 4:
4911     case 8:  etype = T_BYTE; break;
4912     case 16: etype = T_SHORT; break;
4913     case 32: etype = T_INT; break;
4914     case 64: etype = T_LONG; break;
4915     default: fatal("Unsupported type"); break;
4916   }
4917   assert(etype != T_ILLEGAL, "");
4918   switch(ideal_opc) {
4919     case Op_AndVMask:
4920       kand(etype, dst, src1, src2); break;
4921     case Op_OrVMask:
4922       kor(etype, dst, src1, src2); break;
4923     case Op_XorVMask:
4924       kxor(etype, dst, src1, src2); break;
4925     default:
4926       fatal("Unsupported masked operation"); break;
4927   }
4928 }
4929 
4930 /*
4931  * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
4932  * If src is NaN, the result is 0.
4933  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4934  * the result is equal to the value of Integer.MIN_VALUE.
4935  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4936  * the result is equal to the value of Integer.MAX_VALUE.
4937  */
// AVX fixup after a float->int truncating convert: repair dst lanes holding
// the special value loaded from float_sign_flip (the 0x80000000 pattern the
// convert produces for NaN and out-of-range sources). NaN sources become 0;
// positive out-of-range sources become Integer.MAX_VALUE; negative ones keep
// Integer.MIN_VALUE. Clobbers xtmp1-xtmp4.
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                   XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
                                                                   Register rscratch, AddressLiteral float_sign_flip,
                                                                   int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // xtmp2 = per-lane mask of dst lanes equal to the special value.
  vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
  // Fast path: no special lanes, the conversion result is already correct.
  vptest(xtmp2, xtmp2, vec_enc);
  jccb(Assembler::equal, done);

  // xtmp1 = bitwise NOT of the special value, i.e. the max-int pattern.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
  vpxor(xtmp1, xtmp1, xtmp4, vec_enc);

  // Zero dst lanes whose source is NaN (unordered with itself).
  vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
  vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);

  // Recompute the mask for remaining special value.
  vpand(xtmp2, xtmp2, xtmp3, vec_enc);
  // Extract SRC values corresponding to TRUE mask lanes.
  vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
  // values are set.
  vpxor(xtmp3, xtmp2, xtmp4, vec_enc);

  // Blend the max-int pattern into +ve special lanes (vblendvps selects on MSB).
  vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
  bind(done);
}
4967 
// EVEX fixup after a float->int truncating convert, using opmask registers
// instead of vector blends. NaN sources -> 0; remaining special lanes with a
// non-negative source -> max int. Clobbers xtmp1, xtmp2, ktmp1, ktmp2.
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                    XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                    Register rscratch, AddressLiteral float_sign_flip,
                                                                    int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // ktmp1 = mask of dst lanes holding the special value from float_sign_flip.
  evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  // Fast path: no special lanes to repair.
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // ktmp2 = NaN source lanes (unordered with themselves); zero those dst lanes.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // ktmp1 = special lanes that are not NaN; of those, keep lanes whose source
  // is >= 0 (NLT_UQ against zero).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  // xtmp2 = ~xtmp1 via ternlog 0x11, i.e. the max-int pattern per lane.
  vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
4989 
// EVEX fixup after a float->long truncating convert. Same shape as the
// float->int EVEX fixup, but the destination lanes are 64-bit: the special
// pattern comes from double_sign_flip, the compares of dst are quadword,
// and the repair moves/ternlog are quadword. Source compares remain float
// (evcmpps) since src holds floats. Clobbers xtmp1, xtmp2, ktmp1, ktmp2.
void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral double_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // ktmp1 = mask of dst lanes holding the special 64-bit pattern.
  evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
  // Fast path: no special lanes to repair.
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero dst lanes whose (float) source is NaN.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes with a non-negative source get the max-long
  // pattern (~special, computed via ternlog 0x11).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5012 
// EVEX fixup after a double->int truncating convert. Destination lanes are
// 32-bit (dword compares and moves) while source compares are double
// (evcmppd). Clobbers xtmp1, xtmp2, ktmp1, ktmp2.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral float_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // ktmp1 = mask of dst lanes holding the special value from float_sign_flip.
  evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  // Fast path: no special lanes to repair.
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero dst lanes whose (double) source is NaN.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes with a non-negative source get the max-int
  // pattern (~special, computed via ternlog 0x11).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5034 
5035 /*
5036  * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
5037  * If src is NaN, the result is 0.
5038  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5039  * the result is equal to the value of Long.MIN_VALUE.
5040  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5041  * the result is equal to the value of Long.MAX_VALUE.
5042  */
// EVEX fixup after a double->long truncating convert: 64-bit destination
// lanes, double source compares. Clobbers xtmp1, xtmp2, ktmp1, ktmp2.
void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                      Register rscratch, AddressLiteral double_sign_flip,
                                                                      int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // ktmp1 = mask of dst lanes holding the special 64-bit pattern.
  evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
  // Fast path: no special lanes to repair.
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero dst lanes whose (double) source is NaN.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes with a non-negative source get the max-long
  // pattern (~special, computed via ternlog 0x11).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5065 
5066 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5067                                                              XMMRegister xtmp, int index, int vec_enc) {
5068    assert(vec_enc < Assembler::AVX_512bit, "");
5069    if (vec_enc == Assembler::AVX_256bit) {
5070      vextractf128_high(xtmp, src);
5071      vshufps(dst, src, xtmp, index, vec_enc);
5072    } else {
5073      vshufps(dst, src, zero, index, vec_enc);
5074    }
5075 }
5076 
// AVX fixup after a double->int truncating convert: the packed int result
// lives in the low 128 bits of dst while src_vec_enc describes the (wider)
// double source vector, so source-derived masks are packed down before
// blending. Repairs lanes holding the special value (0x80000000) to the
// Java-mandated results. Clobbers xtmp1-xtmp5.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                                    XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
                                                                    AddressLiteral float_sign_flip, int src_vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");

  Label done;
  // Compare the destination lanes with float_sign_flip
  // value to get mask for all special values.
  movdqu(xtmp1, float_sign_flip, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
  // Fast path: no lane holds the special value.
  ptest(xtmp2, xtmp2);
  jccb(Assembler::equal, done);

  // Flip float_sign_flip to get max integer value.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
  pxor(xtmp1, xtmp4);

  // Set destination lanes corresponding to unordered source lanes as zero.
  vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
  vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);

  // Shuffle mask vector and pack lower doubleword from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);

  // Recompute the mask for remaining special value.
  pxor(xtmp2, xtmp3);
  // Extract mask corresponding to non-negative source lanes.
  vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);

  // Shuffle mask vector and pack lower doubleword from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  pand(xtmp3, xtmp2);

  // Replace destination lanes holding special value(0x80000000) with max int
  // if corresponding source lane holds a +ve value.
  vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
  bind(done);
}
5116 
5117 
// Narrow packed int lanes in dst down to short or byte lanes: mask off the
// bits that survive the narrowing, then pack with the 'zero' vector. The
// pack instructions operate within 128-bit lanes, so for 256-bit vectors an
// extra cross-lane doubleword shuffle re-orders the packed result.
// 'zero' must hold all-zero lanes; 'xtmp' is a scratch register.
void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
                                                   XMMRegister xtmp, Register rscratch, int vec_enc) {
  switch(to_elem_bt) {
    case T_SHORT:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
      // Keep the low 16 bits of each int, then pack int -> short.
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      break;
    case  T_BYTE:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
      // Keep the low 8 bits of each int, pack int -> short, then short -> byte.
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      vpackuswb(dst, dst, zero, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt));
  }
}
5141 
5142 /*
5143  * Algorithm for vector D2L and F2I conversions (AVX 10.2 unsupported):-
5144  * a) Perform vector D2L/F2I cast.
5145  * b) Choose fast path if none of the result vector lane contains 0x80000000 value.
5146  *    It signifies that source value could be any of the special floating point
5147  *    values(NaN,-Inf,Inf,Max,-Min).
5148  * c) Set destination to zero if source is NaN value.
5149  * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5150  */
5151 
// Float -> int/short/byte cast on AVX: truncating convert, repair of
// NaN/out-of-range lanes, then mask-and-pack down to the subword size when
// the target is narrower than int.
void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
                                           AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz <= 4, "");
  vcvttps2dq(dst, src, vec_enc);
  // NaN -> 0, out-of-range -> Integer.MIN_VALUE/MAX_VALUE.
  vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
  if (to_elem_sz < 4) {
    // Zero xtmp4 to serve as the zero operand of the packing step.
    vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
    vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
  }
}
5164 
// Float -> int/short/byte cast on EVEX: truncating convert, repair of
// NaN/out-of-range lanes, then evpmovdw/evpmovdb narrowing of the int lanes
// for subword targets.
void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
                                            Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz <= 4, "");
  vcvttps2dq(dst, src, vec_enc);
  // NaN -> 0, out-of-range -> Integer.MIN_VALUE/MAX_VALUE.
  vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
  switch(to_elem_bt) {
    case T_INT:
      break;
    case T_SHORT:
      evpmovdw(dst, dst, vec_enc);
      break;
    case T_BYTE:
      evpmovdb(dst, dst, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt));
  }
}
5184 
// Float -> long cast on EVEX: truncating convert (requires evcvttps2qq),
// followed by repair of NaN/out-of-range lanes to Java semantics.
void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                            KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
                                            Register rscratch, int vec_enc) {
  evcvttps2qq(dst, src, vec_enc);
  vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
}
5191 
5192 // Handling for downcasting from double to integer or sub-word types on AVX2.
// Double -> int/short/byte cast on AVX2: truncating convert (result packed in
// the low 128 bits), repair of special lanes, then subword narrowing.
void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
                                           AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz < 8, "");
  vcvttpd2dq(dst, src, vec_enc);
  vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
                                              float_sign_flip, vec_enc);
  if (to_elem_sz < 4) {
    // xtmp4 holds all zero lanes.
    // NOTE(review): xtmp4 is only zeroed on the slow path of the helper above;
    // on the fast path the upper packed lanes may be garbage but only the low
    // lanes appear to be consumed -- TODO confirm.
    vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
  }
}
5206 
// Double -> long/int/short/byte cast on EVEX. With AVX512DQ, converts
// double->long directly, fixes up special lanes against the 64-bit sign
// pattern, then narrows: evpmovsqd is a signed saturating long->int narrow
// (clamping to the int range), after which evpmovdw/evpmovdb truncate to
// subword sizes. Without DQ, converts double->int and fixes up against the
// 32-bit pattern ('sign_flip' must match the chosen path).
void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
                                            XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
                                            KRegister ktmp2, AddressLiteral sign_flip,
                                            Register rscratch, int vec_enc) {
  if (VM_Version::supports_avx512dq()) {
    evcvttpd2qq(dst, src, vec_enc);
    vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    switch(to_elem_bt) {
      case T_LONG:
        break;
      case T_INT:
        evpmovsqd(dst, dst, vec_enc);
        break;
      case T_SHORT:
        evpmovsqd(dst, dst, vec_enc);
        evpmovdw(dst, dst, vec_enc);
        break;
      case T_BYTE:
        evpmovsqd(dst, dst, vec_enc);
        evpmovdb(dst, dst, vec_enc);
        break;
      default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt));
    }
  } else {
    assert(type2aelembytes(to_elem_bt) <= 4, "");
    vcvttpd2dq(dst, src, vec_enc);
    vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    switch(to_elem_bt) {
      case T_INT:
        break;
      case T_SHORT:
        evpmovdw(dst, dst, vec_enc);
        break;
      case T_BYTE:
        evpmovdb(dst, dst, vec_enc);
        break;
      default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt));
    }
  }
}
5247 
// Float -> long/int/short/byte cast using the AVX10.2 saturating converts
// (register source). No explicit special-value fixup sequence is emitted
// here -- the '...s' conversions are expected to produce the Java-defined
// results for NaN and out-of-range inputs directly. Subword targets narrow
// the int result with evpmovdw/evpmovdb.
void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
  switch(to_elem_bt) {
    case T_LONG:
      evcvttps2qqs(dst, src, vec_enc);
      break;
    case T_INT:
      evcvttps2dqs(dst, src, vec_enc);
      break;
    case T_SHORT:
      evcvttps2dqs(dst, src, vec_enc);
      evpmovdw(dst, dst, vec_enc);
      break;
    case T_BYTE:
      evcvttps2dqs(dst, src, vec_enc);
      evpmovdb(dst, dst, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt));
  }
}
5267 
// Float -> long/int/short/byte cast using the AVX10.2 saturating converts
// (memory source). Mirrors the register-source overload.
void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
  switch(to_elem_bt) {
    case T_LONG:
      evcvttps2qqs(dst, src, vec_enc);
      break;
    case T_INT:
      evcvttps2dqs(dst, src, vec_enc);
      break;
    case T_SHORT:
      evcvttps2dqs(dst, src, vec_enc);
      evpmovdw(dst, dst, vec_enc);
      break;
    case T_BYTE:
      evcvttps2dqs(dst, src, vec_enc);
      evpmovdb(dst, dst, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt));
  }
}
5287 
// Double -> long/int/short/byte cast using the AVX10.2 saturating converts
// (register source). As with the float variant, the '...s' conversions make
// the explicit special-value fixup sequence unnecessary.
void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
  switch(to_elem_bt) {
    case T_LONG:
      evcvttpd2qqs(dst, src, vec_enc);
      break;
    case T_INT:
      evcvttpd2dqs(dst, src, vec_enc);
      break;
    case T_SHORT:
      evcvttpd2dqs(dst, src, vec_enc);
      evpmovdw(dst, dst, vec_enc);
      break;
    case T_BYTE:
      evcvttpd2dqs(dst, src, vec_enc);
      evpmovdb(dst, dst, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (reg src): %s", type2name(to_elem_bt));
  }
}
5307 
// Double -> long/int/short/byte cast using the AVX10.2 saturating converts
// (memory source). Mirrors the register-source overload.
void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
  switch(to_elem_bt) {
    case T_LONG:
      evcvttpd2qqs(dst, src, vec_enc);
      break;
    case T_INT:
      evcvttpd2dqs(dst, src, vec_enc);
      break;
    case T_SHORT:
      evcvttpd2dqs(dst, src, vec_enc);
      evpmovdw(dst, dst, vec_enc);
      break;
    case T_BYTE:
      evcvttpd2dqs(dst, src, vec_enc);
      evpmovdb(dst, dst, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt));
  }
}
5327 
5328 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5329                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5330                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5331   // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
5332   // and re-instantiate original MXCSR.RC mode after that.
5333   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5334 
5335   mov64(tmp, julong_cast(0.5L));
5336   evpbroadcastq(xtmp1, tmp, vec_enc);
5337   vaddpd(xtmp1, src , xtmp1, vec_enc);
5338   evcvtpd2qq(dst, xtmp1, vec_enc);
5339   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5340                                                 double_sign_flip, vec_enc);;
5341 
5342   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5343 }
5344 
// Vector rounding of float lanes to int on EVEX: floor(val + 0.5) via an
// MXCSR.RC override to round-towards-negative-infinity, then repair of
// NaN/out-of-range lanes. Standard MXCSR is restored before returning.
void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Broadcast the constant 0.5f to every float lane and add it to src.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  // Convert to int; the RC override makes this a floor.
  vcvtps2dq(dst, xtmp1, vec_enc);
  // NaN -> 0, out-of-range -> Integer.MIN_VALUE/MAX_VALUE.
  vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                              float_sign_flip, vec_enc);

  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5362 
// Vector rounding of float lanes to int on AVX: floor(val + 0.5) via an
// MXCSR.RC override to round-towards-negative-infinity, then repair of
// NaN/out-of-range lanes. Standard MXCSR is restored before returning.
void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
                                               AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                               Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Broadcast the constant 0.5f to every float lane and add it to src.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  // Convert to int; the RC override makes this a floor.
  vcvtps2dq(dst, xtmp1, vec_enc);
  // NaN -> 0, out-of-range -> Integer.MIN_VALUE/MAX_VALUE.
  vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);

  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5379 
// Zero-extending (unsigned) vector element cast: widens each element of src
// from from_elem_bt to to_elem_bt into dst using the matching vpmovzx*
// instruction. Only widening integral conversions are supported; any other
// type combination is a usage error.
void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
                                             BasicType from_elem_bt, BasicType to_elem_bt) {
  switch (from_elem_bt) {
    case T_BYTE:
      switch (to_elem_bt) {
        case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
        case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
        case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
        default: ShouldNotReachHere();
      }
      break;
    case T_SHORT:
      switch (to_elem_bt) {
        case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
        case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
        default: ShouldNotReachHere();
      }
      break;
    case T_INT:
      assert(to_elem_bt == T_LONG, "");
      vpmovzxdq(dst, src, vlen_enc);
      break;
    default:
      ShouldNotReachHere();
  }
}
5406 
// Sign-extending (signed) vector element cast: widens each element of src
// from from_elem_bt to to_elem_bt into dst using the matching vpmovsx*
// instruction. Mirrors vector_unsigned_cast; only widening integral
// conversions are supported.
void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
                                           BasicType from_elem_bt, BasicType to_elem_bt) {
  switch (from_elem_bt) {
    case T_BYTE:
      switch (to_elem_bt) {
        case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
        case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
        case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
        default: ShouldNotReachHere();
      }
      break;
    case T_SHORT:
      switch (to_elem_bt) {
        case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
        case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
        default: ShouldNotReachHere();
      }
      break;
    case T_INT:
      assert(to_elem_bt == T_LONG, "");
      vpmovsxdq(dst, src, vlen_enc);
      break;
    default:
      ShouldNotReachHere();
  }
}
5433 
// Re-sizes vector-register mask lanes (each lane holds 0 or -1) from src_bt
// element width to dst_bt element width; vlen is the lane count. Not used
// with 512-bit encodings (those use opmask registers instead).
//
// Widening exploits the mask invariant: since every lane is all-zeros or
// all-ones, sign-extending at *byte* granularity by the size ratio
// reproduces a correctly widened 0/-1 lane regardless of the source element
// size (e.g. a 0xFFFF short widened 2x via vpmovsxbw yields 0xFFFFFFFF).
// Narrowing uses signed-saturation packs (0 packs to 0, -1 to -1); for
// 256-bit vectors the packs operate independently per 128-bit lane, so a
// vpermq with selector 0x08 gathers the valid quadwords into the low half.
void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
                                         BasicType dst_bt, BasicType src_bt, int vlen) {
  int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
  assert(vlen_enc != AVX_512bit, "");

  int dst_bt_size = type2aelembytes(dst_bt);
  int src_bt_size = type2aelembytes(src_bt);
  if (dst_bt_size > src_bt_size) {
    // Widening: sign-extend bytes by the ratio (see note above).
    switch (dst_bt_size / src_bt_size) {
      case 2: vpmovsxbw(dst, src, vlen_enc); break;
      case 4: vpmovsxbd(dst, src, vlen_enc); break;
      case 8: vpmovsxbq(dst, src, vlen_enc); break;
      default: ShouldNotReachHere();
    }
  } else {
    assert(dst_bt_size < src_bt_size, "");
    // Narrowing: halve the element width with a saturating pack per step.
    switch (src_bt_size / dst_bt_size) {
      case 2: {
        if (vlen_enc == AVX_128bit) {
          vpacksswb(dst, src, src, vlen_enc);
        } else {
          vpacksswb(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
        }
        break;
      }
      case 4: {
        if (vlen_enc == AVX_128bit) {
          vpackssdw(dst, src, src, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpackssdw(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      case 8: {
        if (vlen_enc == AVX_128bit) {
          // First halve 8-byte lanes to 4 bytes by selecting even dwords.
          vpshufd(dst, src, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpshufd(dst, src, 0x08, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, AVX_128bit);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      default: ShouldNotReachHere();
    }
  }
}
5488 
5489 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5490                                    bool merge, BasicType bt, int vlen_enc) {
5491   if (bt == T_INT) {
5492     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5493   } else {
5494     assert(bt == T_LONG, "");
5495     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5496   }
5497 }
5498 
5499 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5500                                    bool merge, BasicType bt, int vlen_enc) {
5501   if (bt == T_INT) {
5502     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5503   } else {
5504     assert(bt == T_LONG, "");
5505     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5506   }
5507 }
5508 
// Expands the low mask_len bits of the GPR src into a byte-per-lane vector
// mask in dst: bit i of src becomes byte i holding 0x01 (set) or 0x00
// (clear). Each group of 8 mask bits is spread into 8 bytes with pdepq
// using the 0x0101010101010101 deposit pattern, then written into dst one
// 64-bit half (or, beyond 128 bits, one 128-bit lane) at a time.
// mask_len must be a multiple of 8 past the first 8 bits.
void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
                                               Register rtmp2, XMMRegister xtmp, int mask_len,
                                               int vec_enc) {
  int index = 0;
  int vindex = 0;
  // Deposit the low 8 mask bits, one bit per byte.
  mov64(rtmp1, 0x0101010101010101L);
  pdepq(rtmp1, src, rtmp1);
  if (mask_len > 8) {
    // Keep a shiftable copy of the mask bits and stage the first quadword
    // into the 128-bit staging register xtmp.
    movq(rtmp2, src);
    vpxor(xtmp, xtmp, xtmp, vec_enc);
    movq(xtmp, rtmp1);
  }
  movq(dst, rtmp1);

  mask_len -= 8;
  while (mask_len > 0) {
    assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
    index++;
    if ((index % 2) == 0) {
      // Starting a fresh 128-bit lane in the staging register.
      pxor(xtmp, xtmp);
    }
    // Deposit the next 8 mask bits into 8 bytes.
    mov64(rtmp1, 0x0101010101010101L);
    shrq(rtmp2, 8);
    pdepq(rtmp1, rtmp2, rtmp1);
    pinsrq(xtmp, rtmp1, index % 2);
    vindex = index / 2;
    if (vindex) {
      // Write entire 16 byte vector when both 64 bit
      // lanes are updated to save redundant instructions.
      if (index % 2) {
        vinsertf128(dst, dst, xtmp, vindex);
      }
    } else {
      // Still within the first 128-bit lane: copy the staging register out.
      vmovdqu(dst, xtmp);
    }
    mask_len -= 8;
  }
}
5547 
// Computes a scalar query over the mask bits held in tmp (one bit per lane,
// masklen lanes) and leaves the answer in dst. tmp is clobbered.
//   Op_VectorMaskTrueCount: dst = number of set bits.
//   Op_VectorMaskLastTrue:  dst = index of the highest set bit, -1 if none.
//   Op_VectorMaskFirstTrue: dst = index of the lowest set bit, masklen if none.
//   Op_VectorMaskToLong:    bits are the result; caller must pass dst == tmp.
void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
  switch(opc) {
    case Op_VectorMaskTrueCount:
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      if (VM_Version::supports_lzcnt()) {
        // lzcntq yields 64 for a zero input, so 63 - 64 = -1 for an empty mask.
        lzcntq(tmp, tmp);
        movl(dst, 63);
        subl(dst, tmp);
      } else {
        // bsrq leaves ZF set for a zero input; keep the -1 default then.
        movl(dst, -1);
        bsrq(tmp, tmp);
        cmov32(Assembler::notZero, dst, tmp);
      }
      break;
    case Op_VectorMaskFirstTrue:
      if (VM_Version::supports_bmi1()) {
        if (masklen < 32) {
          // Plant a sentinel bit at position masklen so an empty mask
          // reports masklen instead of the register width.
          orl(tmp, 1 << masklen);
          tzcntl(dst, tmp);
        } else if (masklen == 32) {
          // tzcntl returns 32 for a zero input, which is already masklen.
          tzcntl(dst, tmp);
        } else {
          assert(masklen == 64, "");
          // tzcntq returns 64 for a zero input, which is already masklen.
          tzcntq(dst, tmp);
        }
      } else {
        if (masklen < 32) {
          orl(tmp, 1 << masklen);
          bsfl(dst, tmp);
        } else {
          assert(masklen == 32 || masklen == 64, "");
          // bsf leaves ZF set for a zero input; keep the masklen default then.
          movl(dst, masklen);
          if (masklen == 32)  {
            bsfl(tmp, tmp);
          } else {
            bsfq(tmp, tmp);
          }
          cmov32(Assembler::notZero, dst, tmp);
        }
      }
      break;
    case Op_VectorMaskToLong:
      assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
      break;
    default: assert(false, "Unhandled mask operation");
  }
}
5597 
// Opmask-register flavor of the scalar mask queries: moves the KRegister
// mask into GPR tmp, clips stray high bits from partial masks, and then
// delegates to vector_mask_operation_helper. dst receives the result.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
                                              int masklen, int masksize, int vec_enc) {
  assert(VM_Version::supports_popcnt(), "");

  // With AVX512BW up to 64 mask bits can be transferred; otherwise the
  // opmask holds at most 16 bits.
  if(VM_Version::supports_avx512bw()) {
    kmovql(tmp, mask);
  } else {
    assert(masklen <= 16, "");
    kmovwl(tmp, mask);
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  // (FirstTrue is exempt: the helper plants its own sentinel bit.)
  if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5617 
// Vector-register flavor of the scalar mask queries: extracts a bit-per-lane
// GPR mask from the XMM/YMM mask with the movmsk family, clips partial
// masks, and delegates to vector_mask_operation_helper. xtmp is scratch.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              Register tmp, int masklen, BasicType bt, int vec_enc) {
  assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
         (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
  assert(VM_Version::supports_popcnt(), "");

  bool need_clip = false;
  switch(bt) {
    case T_BOOLEAN:
      // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1
      // Negate (0 - x) to turn 1 into -1 so vpmovmskb can pick up the sign bit.
      vpxor(xtmp, xtmp, xtmp, vec_enc);
      vpsubb(xtmp, xtmp, mask, vec_enc);
      vpmovmskb(tmp, xtmp, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_BYTE:
      vpmovmskb(tmp, mask, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_SHORT:
      // Pack 16-bit lanes down to bytes first; for 256-bit vectors the pack
      // works per 128-bit lane, so gather the valid quadwords with vpermpd.
      vpacksswb(xtmp, mask, mask, vec_enc);
      if (masklen >= 16) {
        vpermpd(xtmp, xtmp, 8, vec_enc);
      }
      vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
      need_clip = masklen < 16;
      break;
    case T_INT:
    case T_FLOAT:
      vmovmskps(tmp, mask, vec_enc);
      need_clip = masklen < 4;
      break;
    case T_LONG:
    case T_DOUBLE:
      vmovmskpd(tmp, mask, vec_enc);
      need_clip = masklen < 2;
      break;
    default: assert(false, "Unhandled type, %s", type2name(bt));
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  // (FirstTrue is exempt: the helper plants its own sentinel bit.)
  if (need_clip && opc != Op_VectorMaskFirstTrue) {
    // need_clip implies masklen < 32
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5667 
// Produces in dst a mask with the same popcount as the low mask_len bits of
// src, but with all set bits packed contiguously at the low end, i.e.
// dst = (1 << popcount(src & lenmask)) - 1. pextq gathers the bits of the
// all-ones value at the positions selected by the clipped source mask.
void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
                                             Register rtmp2, int mask_len) {
  kmov(rtmp1, src);
  andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));  // clip to mask_len bits
  mov64(rtmp2, -1L);
  pextq(rtmp2, rtmp2, rtmp1);
  kmov(dst, rtmp2);
}
5676 
// AVX2 (table-driven) fallback for vector compress/expand of 4- or 8-byte
// elements. The lane mask is turned into a scalar bitmask which indexes a
// precomputed permutation table (32 bytes per row); vpermps applies the
// row to shuffle the selected lanes into place, and lanes the row marks
// invalid (entries of -1, whose sign bit is set) are zeroed by the blend.
void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
                                                    XMMRegister mask, Register rtmp, Register rscratch,
                                                    XMMRegister permv, XMMRegister xtmp, BasicType bt,
                                                    int vec_enc) {
  assert(type2aelembytes(bt) >= 4, "");
  assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
  address compress_perm_table = nullptr;
  address expand_perm_table = nullptr;
  // Pick the table for the element width and extract the lane mask bits.
  if (type2aelembytes(bt) == 8) {
    compress_perm_table = StubRoutines::x86::compress_perm_table64();
    expand_perm_table  = StubRoutines::x86::expand_perm_table64();
    vmovmskpd(rtmp, mask, vec_enc);
  } else {
    compress_perm_table = StubRoutines::x86::compress_perm_table32();
    expand_perm_table = StubRoutines::x86::expand_perm_table32();
    vmovmskps(rtmp, mask, vec_enc);
  }
  shlq(rtmp, 5); // for 32 byte permute row.
  if (opcode == Op_CompressV) {
    lea(rscratch, ExternalAddress(compress_perm_table));
  } else {
    lea(rscratch, ExternalAddress(expand_perm_table));
  }
  // Load the permute row selected by the mask and apply it.
  addptr(rtmp, rscratch);
  vmovdqu(permv, Address(rtmp));
  vpermps(dst, permv, src, Assembler::AVX_256bit);
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with zero vector using permute mask, each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, this can potentially be used as a blending mask after
  // compressing/expanding the source vector lanes.
  vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
}
5710 
5711 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5712                                                bool merge, BasicType bt, int vec_enc) {
5713   if (opcode == Op_CompressV) {
5714     switch(bt) {
5715     case T_BYTE:
5716       evpcompressb(dst, mask, src, merge, vec_enc);
5717       break;
5718     case T_CHAR:
5719     case T_SHORT:
5720       evpcompressw(dst, mask, src, merge, vec_enc);
5721       break;
5722     case T_INT:
5723       evpcompressd(dst, mask, src, merge, vec_enc);
5724       break;
5725     case T_FLOAT:
5726       evcompressps(dst, mask, src, merge, vec_enc);
5727       break;
5728     case T_LONG:
5729       evpcompressq(dst, mask, src, merge, vec_enc);
5730       break;
5731     case T_DOUBLE:
5732       evcompresspd(dst, mask, src, merge, vec_enc);
5733       break;
5734     default:
5735       fatal("Unsupported type %s", type2name(bt));
5736       break;
5737     }
5738   } else {
5739     assert(opcode == Op_ExpandV, "");
5740     switch(bt) {
5741     case T_BYTE:
5742       evpexpandb(dst, mask, src, merge, vec_enc);
5743       break;
5744     case T_CHAR:
5745     case T_SHORT:
5746       evpexpandw(dst, mask, src, merge, vec_enc);
5747       break;
5748     case T_INT:
5749       evpexpandd(dst, mask, src, merge, vec_enc);
5750       break;
5751     case T_FLOAT:
5752       evexpandps(dst, mask, src, merge, vec_enc);
5753       break;
5754     case T_LONG:
5755       evpexpandq(dst, mask, src, merge, vec_enc);
5756       break;
5757     case T_DOUBLE:
5758       evexpandpd(dst, mask, src, merge, vec_enc);
5759       break;
5760     default:
5761       fatal("Unsupported type %s", type2name(bt));
5762       break;
5763     }
5764   }
5765 }
5766 
// Vector signum for double (Op_SignumVD) or float (Op_SignumVF) lanes under
// EVEX: dst lane = -1.0 if src < 0, +1.0 if src > 0, and src itself if the
// lane is NaN or +/-0.0. 'zero' and 'one' hold per-lane 0.0 and 1.0;
// -1.0 is materialized as zero - one. ktmp1 is scratch.
void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                           KRegister ktmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    vsubpd(dst, zero, one, vec_enc);  // dst = -1.0 in every lane
    // if src < 0 ? -1 : 1
    evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    // (EQ_UQ is an unordered-quiet compare, so NaN lanes also match.)
    evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
  } else {
    assert(opcode == Op_SignumVF, "");
    vsubps(dst, zero, one, vec_enc);  // dst = -1.0f in every lane
    // if src < 0 ? -1 : 1
    evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmps(dst, ktmp1, dst, src, true, vec_enc);
  }
}
5788 
// AVX (non-EVEX) counterpart of vector_signum_evex: dst lane = -1.0 if
// src < 0, +1.0 if src > 0, and src itself for NaN or +/-0.0 lanes. The
// first blend keys off the sign bit of src directly; the second uses an
// explicit unordered-quiet equality compare against zero. xtmp1 is scratch.
void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                          XMMRegister xtmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    vsubpd(dst, zero, one, vec_enc);  // dst = -1.0 in every lane
    // if src < 0 ? -1 : 1
    vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  } else {
    assert(opcode == Op_SignumVF, "");
    vsubps(dst, zero, one, vec_enc);  // dst = -1.0f in every lane
    // if src < 0 ? -1 : 1
    vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  }
}
5808 
// Loads the GPR src into opmask dst and right-shifts so that exactly
// mask_len mask bits remain (used for VectorMask.maskAll-style broadcasts).
// With AVX512BW up to 64 bits can be moved; otherwise only 16-bit kmovs
// are available, restricting mask_len to 16.
void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
  if (VM_Version::supports_avx512bw()) {
    if (mask_len > 32) {
      kmovql(dst, src);
    } else {
      kmovdl(dst, src);
      if (mask_len != 32) {
        // Drop the excess high bits so only mask_len bits survive.
        kshiftrdl(dst, dst, 32 - mask_len);
      }
    }
  } else {
    assert(mask_len <= 16, "");
    kmovwl(dst, src);
    if (mask_len != 16) {
      kshiftrwl(dst, dst, 16 - mask_len);
    }
  }
}
5827 
5828 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5829   int lane_size = type2aelembytes(bt);
5830   if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5831       (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5832     movptr(rtmp, imm32);
5833     switch(lane_size) {
5834       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5835       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5836       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5837       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5838       fatal("Unsupported lane size %d", lane_size);
5839       break;
5840     }
5841   } else {
5842     movptr(rtmp, imm32);
5843     movq(dst, rtmp);
5844     switch(lane_size) {
5845       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5846       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5847       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5848       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5849       fatal("Unsupported lane size %d", lane_size);
5850       break;
5851     }
5852   }
5853 }
5854 
5855 //
5856 // Following is lookup table based popcount computation algorithm:-
5857 //       Index   Bit set count
5858 //     [ 0000 ->   0,
5859 //       0001 ->   1,
5860 //       0010 ->   1,
5861 //       0011 ->   2,
5862 //       0100 ->   1,
5863 //       0101 ->   2,
5864 //       0110 ->   2,
5865 //       0111 ->   3,
5866 //       1000 ->   1,
5867 //       1001 ->   2,
5868 //       1010 ->   3,
5869 //       1011 ->   3,
5870 //       1100 ->   2,
5871 //       1101 ->   3,
5872 //       1111 ->   4 ]
5873 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5874 //     shuffle indices for lookup table access.
5875 //  b. Right shift each byte of vector lane by 4 positions.
5876 //  c. Count the number of 1s in 4 MSB bits each byte. These bits are used as
5877 //     shuffle indices for lookup table access.
5878 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5879 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5880 //     count of all the bytes of a quadword.
5881 //  f. Perform step e. for upper 128bit vector lane.
5882 //  g. Pack the bitset count of quadwords back to double word.
5883 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
5884 
// Per-byte popcount via the 16-entry nibble lookup table (steps a-d of the
// algorithm described above): the low and high nibbles of each byte index
// the LUT via vpshufb, and the two partial counts are added per byte.
// xtmp1/xtmp2 are clobbered; rtmp is scratch for the broadcast.
void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
  vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);  // per-byte nibble mask
  vpsrlw(dst, src, 4, vec_enc);          // high nibbles ...
  vpand(dst, dst, xtmp1, vec_enc);       // ... isolated in dst
  vpand(xtmp1, src, xtmp1, vec_enc);     // low nibbles in xtmp1
  vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
  vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); // LUT[low nibble]
  vpshufb(dst, xtmp2, dst, vec_enc);     // LUT[high nibble]
  vpaddb(dst, dst, xtmp1, vec_enc);      // per-byte popcount
}
5897 
// Per-int popcount: computes per-byte counts first, then sums the bytes of
// each dword by widening to quadwords, using vpsadbw (sum of absolute
// differences against zero) as a horizontal byte-adder, and packing the
// quadword sums back into dwords (steps e-h of the algorithm above).
void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Following code is as per steps e,f,g and h of above algorithm.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
  vpsadbw(dst, dst, xtmp2, vec_enc);
  vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
  vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
  vpackuswb(dst, xtmp1, dst, vec_enc);
}
5909 
// Per-short popcount: computes per-byte counts, then adds the high byte's
// count (shifted down) to the low byte's count within each 16-bit lane.
void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Add the popcount of upper and lower bytes of word.
  vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
  vpsrlw(dst, xtmp1, 8, vec_enc);        // upper-byte counts
  vpand(xtmp1, xtmp1, xtmp2, vec_enc);   // lower-byte counts
  vpaddw(dst, dst, xtmp1, vec_enc);
}
5919 
// Per-long popcount: computes per-byte counts, then vpsadbw against zero
// horizontally sums the 8 byte-counts of each quadword in one step.
void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpsadbw(dst, xtmp1, xtmp2, vec_enc);
}
5926 
5927 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5928                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5929   switch(bt) {
5930     case T_LONG:
5931       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5932       break;
5933     case T_INT:
5934       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5935       break;
5936     case T_CHAR:
5937     case T_SHORT:
5938       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5939       break;
5940     case T_BYTE:
5941     case T_BOOLEAN:
5942       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5943       break;
5944     default:
5945       fatal("Unsupported type %s", type2name(bt));
5946       break;
5947   }
5948 }
5949 
// Hardware per-element popcount using the AVX-512 VPOPCNT* instructions
// under an optional opmask: dword/qword forms need AVX512_VPOPCNTDQ,
// byte/word forms need AVX512_BITALG.
void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                      KRegister mask, bool merge, int vec_enc) {
  assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
  switch(bt) {
    case T_LONG:
      assert(VM_Version::supports_avx512_vpopcntdq(), "");
      evpopcntq(dst, mask, src, merge, vec_enc);
      break;
    case T_INT:
      assert(VM_Version::supports_avx512_vpopcntdq(), "");
      evpopcntd(dst, mask, src, merge, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      assert(VM_Version::supports_avx512_bitalg(), "");
      evpopcntw(dst, mask, src, merge, vec_enc);
      break;
    case T_BYTE:
    case T_BOOLEAN:
      assert(VM_Version::supports_avx512_bitalg(), "");
      evpopcntb(dst, mask, src, merge, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
5977 
5978 // Bit reversal algorithm first reverses the bits of each byte followed by
5979 // a byte level reversal for multi-byte primitive types (short/int/long).
5980 // Algorithm performs a lookup table access to get reverse bit sequence
5981 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte
5982 // is obtained by swapping the reverse bit sequences of upper and lower
5983 // nibble of a byte.
// Reverses the bit order within each element of type bt (see the algorithm
// comment above): the bits of every byte are reversed via the nibble LUT
// (or, on AVX-512 without VLBW, via successive bit-group swaps), followed
// by a byte-order reversal for multi-byte element types.
void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, Register rtmp, int vec_enc) {
  if (VM_Version::supports_avx512vlbw()) {

    // Get the reverse bit sequence of lower nibble of each byte.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
    evpandq(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    evporq(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);

  } else if(vec_enc == Assembler::AVX_512bit) {
    // Shift based bit reversal.
    assert(bt == T_LONG || bt == T_INT, "");

    // Swap lower and upper nibble of each byte.
    vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);

    // Swap two least and most significant bits of each nibble.
    vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);

    // Swap adjacent pair of bits.
    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);

    // Finish with a byte-order reversal of each element.
    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
  } else {
    // AVX/AVX2 path: same nibble-LUT scheme as the VLBW branch, using the
    // non-EVEX logical ops.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);

    // Get the reverse bit sequence of lower nibble of each byte.
    vpand(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    vpor(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);
  }
}
6041 
// GFNI-based per-element bit reversal: a single vgf2p8affineqb with the
// broadcast affine matrix 'mask' reverses the bits of every byte, after
// which only a byte-order reversal per element is needed.
void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
                                                XMMRegister xtmp, Register rscratch) {
  assert(VM_Version::supports_gfni(), "");
  assert(rscratch != noreg || always_reachable(mask), "missing");

  // Galois field instruction based bit reversal based on following algorithm.
  // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
  vpbroadcastq(xtmp, mask, vec_enc, rscratch);
  vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
  vector_reverse_byte(bt, dst, xtmp, vec_enc);
}
6053 
// Swaps adjacent nbits-wide bit groups within each 64-bit lane:
//   dst = ((src & bitmask) << nbits) | ((src & ~bitmask) >> nbits)
// where bitmask (broadcast as a 32-bit pattern) selects the groups to be
// shifted left. xtmp1 and rtmp are scratch.
void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
                                          XMMRegister xtmp1, Register rtmp, int vec_enc) {
  vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
  evpandq(dst, xtmp1, src, vec_enc);    // selected groups ...
  vpsllq(dst, dst, nbits, vec_enc);     // ... move left
  vpandn(xtmp1, xtmp1, src, vec_enc);   // complementary groups ...
  vpsrlq(xtmp1, xtmp1, nbits, vec_enc); // ... move right
  evporq(dst, dst, xtmp1, vec_enc);
}
6063 
// Byte-order reversal of each element of type bt using EVEX rotates plus an
// 8-bit group swap (no shuffle table): progressively swaps halves of the
// element, finishing with the adjacent-byte swap done by vector_swap_nbits.
// T_BYTE is a plain copy since a single byte has no order to reverse.
void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Shift based bit reversal.
  assert(VM_Version::supports_evex(), "");
  switch(bt) {
    case T_LONG:
      // Swap upper and lower double word of each quad word.
      evprorq(xtmp1, k0, src, 32, true, vec_enc);
      evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_INT:
      // Swap upper and lower word of each double word.
      evprord(xtmp1, k0, src, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      // Swap upper and lower byte of each word.
      vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
      break;
    case T_BYTE:
      evmovdquq(dst, k0, src, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6093 
// Byte-order reversal of each element of type bt via vpshufb with a
// precomputed per-type shuffle mask. T_BYTE degenerates to a copy.
void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
  if (bt == T_BYTE) {
    if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
      evmovdquq(dst, k0, src, true, vec_enc);
    } else {
      vmovdqu(dst, src);
    }
    return;
  }
  // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
  // pre-computed shuffle indices.
  switch(bt) {
    case T_LONG:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
      break;
    case T_INT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
      break;
    case T_CHAR:
    case T_SHORT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
  vpshufb(dst, src, dst, vec_enc);
}
6122 
// Per-element count-leading-zeros under EVEX. Dword/qword lanes map
// directly to VPLZCNT (AVX512CD). Word lanes are widened to dwords by
// interleaving with all-ones words in the low half, so the 32-bit lzcnt of
// each widened lane equals the 16-bit lzcnt of the original word (the
// all-ones low half caps the count at 16). Byte lanes use the nibble LUT
// scheme described in the case body.
void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                        KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
  assert(is_integral_type(bt), "");
  assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
  assert(VM_Version::supports_avx512cd(), "");
  switch(bt) {
    case T_LONG:
      evplzcntq(dst, ktmp, src, merge, vec_enc);
      break;
    case T_INT:
      evplzcntd(dst, ktmp, src, merge, vec_enc);
      break;
    case T_SHORT:
      vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);  // xtmp1 = all-ones
      vpunpcklwd(xtmp2, xtmp1, src, vec_enc);          // low words -> dwords (src word high, 0xFFFF low)
      evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
      vpunpckhwd(dst, xtmp1, src, vec_enc);            // high words likewise
      evplzcntd(dst, ktmp, dst, merge, vec_enc);
      vpackusdw(dst, xtmp2, dst, vec_enc);             // repack dword counts into words
      break;
    case T_BYTE:
      // T1 = Compute leading zero counts of 4 LSB bits of each byte by
      // accessing the lookup table.
      // T2 = Compute leading zero counts of 4 MSB bits of each byte by
      // accessing the lookup table.
      // Add T1 to T2 if 4 MSB bits of byte are all zeros.
      assert(VM_Version::supports_avx512bw(), "");
      evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
      vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
      vpand(xtmp2, dst, src, vec_enc);
      vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);           // T1 = LUT[low nibble]
      vpsrlw(xtmp3, src, 4, vec_enc);
      vpand(xtmp3, dst, xtmp3, vec_enc);
      vpshufb(dst, xtmp1, xtmp3, vec_enc);             // T2 = LUT[high nibble]
      vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
      evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);          // lanes whose high nibble is zero
      evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);   // dst = T2 (+ T1 where high nibble == 0)
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6167 
// Leading zero count per byte lane on AVX (pre-AVX512) targets, using a
// per-nibble lookup table loaded from the stub area.
// NOTE: leaves xtmp1 zeroed on exit; vector_count_leading_zeros_short_avx
// depends on that.
void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  // xtmp1 = nibble lzcnt lookup table, xtmp2 = 0x0F mask in every byte.
  vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
  vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
  // T1 = Compute leading zero counts of 4 LSB bits of each byte by
  // accessing the lookup table.
  vpand(dst, xtmp2, src, vec_enc);
  vpshufb(dst, xtmp1, dst, vec_enc);
  // T2 = Compute leading zero counts of 4 MSB bits of each byte by
  // accessing the lookup table.
  vpsrlw(xtmp3, src, 4, vec_enc);
  vpand(xtmp3, xtmp2, xtmp3, vec_enc);
  vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
  // Add T1 to T2 if 4 MSB bits of byte are all zeros.
  // xtmp3 = mask of bytes whose high nibble is zero.
  vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
  vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
  // dst = T1 + T2; keep that sum only where the high nibble was zero,
  // otherwise T2 alone is the byte's leading zero count.
  vpaddb(dst, dst, xtmp2, vec_enc);
  vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
}
6187 
// Leading zero count per short lane on AVX targets, composed from the
// per-byte counts produced by vector_count_leading_zeros_byte_avx.
void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  // dst now holds the lzcnt of every individual byte.
  vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add zero counts of lower byte and upper byte of a word if
  // upper byte holds a zero value.
  vpsrlw(xtmp3, src, 8, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  // xtmp3 = per-word mask: all ones where the word's upper byte is zero.
  vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
  // xtmp2's upper byte = lower-byte count + upper-byte count (upper count is
  // 8 there, so the sum is the full 16-bit lzcnt).
  vpsllw(xtmp2, dst, 8, vec_enc);
  vpaddw(xtmp2, xtmp2, dst, vec_enc);
  // Select the combined count where the upper byte was zero, else the
  // upper-byte count alone; the word result sits in the upper byte.
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  // Move the per-word result down into the low byte of each word.
  vpsrlw(dst, dst, 8, vec_enc);
}
6201 
// Leading zero count per int lane on AVX targets, derived from the exponent
// field after an int -> float conversion.
void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                           XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // By converting the integer to a float, we can obtain the number of leading zeros based on the exponent of the float.
  // As the float exponent contains a bias of 127 for nonzero values, the bias must be removed before interpreting the
  // exponent as the leading zero count.

  // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
  // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
  // contributes to the leading number of zeros.
  vpsrld(dst, src, 1, vec_enc);
  vpandn(dst, dst, src, vec_enc);

  vcvtdq2ps(dst, dst, vec_enc);

  // By comparing the register to itself, all the bits in the destination are set.
  // xtmp1 = -1 in every lane; shifted copies of it supply the constants below.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);

  // Move the biased exponent to the low end of the lane and mask with 0xFF to discard the sign bit.
  vpsrld(xtmp2, xtmp1, 24, vec_enc);   // xtmp2 = 0x000000FF
  vpsrld(dst, dst, 23, vec_enc);
  vpand(dst, xtmp2, dst, vec_enc);

  // Subtract 127 from the exponent, which removes the bias from the exponent.
  vpsrld(xtmp2, xtmp1, 25, vec_enc);   // xtmp2 = 127 (the exponent bias)
  vpsubd(dst, dst, xtmp2, vec_enc);

  vpsrld(xtmp2, xtmp1, 27, vec_enc);   // xtmp2 = 31

  // If the original value is 0 the exponent would not have bias, so the subtraction creates a negative number. If this
  // is found in any of the lanes, replace the lane with -1 from xtmp1.
  vblendvps(dst, dst, xtmp1, dst, vec_enc, true, xtmp3);

  // If the original value is negative, replace the lane with 31.
  vblendvps(dst, dst, xtmp2, src, vec_enc, true, xtmp3);

  // Subtract the exponent from 31, giving the final result. For 0, the result is 32 as the exponent was replaced with -1,
  // and for negative numbers the result is 0 as the exponent was replaced with 31.
  vpsubd(dst, xtmp2, dst, vec_enc);
}
6241 
// Leading zero count per long lane on AVX targets, built from two int-lane
// counts: if the upper 32 bits are zero, the result is 32 + lzcnt(lower half),
// otherwise lzcnt(upper half).
void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  // Find the leading zeros of the top and bottom halves of the long individually.
  vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);

  // Move the top half result to the bottom half of xtmp1, setting the top half to 0.
  vpsrlq(xtmp1, dst, 32, vec_enc);
  // By moving the top half result to the right by 6 bits, if the top half was empty (i.e. 32 is returned) the result bit will
  // be in the most significant position of the bottom half.
  vpsrlq(xtmp2, dst, 6, vec_enc);

  // In the bottom half, add the top half and bottom half results.
  vpaddq(dst, xtmp1, dst, vec_enc);

  // For the bottom half, choose between the values using the most significant bit of xtmp2.
  // If the MSB is set, then bottom+top in dst is the resulting value. If the top half is less than 32 xtmp1 is chosen,
  // which contains only the top half result.
  // In the top half the MSB is always zero, so the value in xtmp1 is always chosen. This value is always 0, which clears
  // the lane as required.
  vblendvps(dst, xtmp1, dst, xtmp2, vec_enc, true, xtmp3);
}
6263 
6264 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6265                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6266                                                        Register rtmp, int vec_enc) {
6267   assert(is_integral_type(bt), "unexpected type");
6268   assert(vec_enc < Assembler::AVX_512bit, "");
6269   switch(bt) {
6270     case T_LONG:
6271       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6272       break;
6273     case T_INT:
6274       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6275       break;
6276     case T_SHORT:
6277       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6278       break;
6279     case T_BYTE:
6280       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6281       break;
6282     default:
6283       fatal("Unsupported type %s", type2name(bt));
6284       break;
6285   }
6286 }
6287 
6288 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6289   switch(bt) {
6290     case T_BYTE:
6291       vpsubb(dst, src1, src2, vec_enc);
6292       break;
6293     case T_SHORT:
6294       vpsubw(dst, src1, src2, vec_enc);
6295       break;
6296     case T_INT:
6297       vpsubd(dst, src1, src2, vec_enc);
6298       break;
6299     case T_LONG:
6300       vpsubq(dst, src1, src2, vec_enc);
6301       break;
6302     default:
6303       fatal("Unsupported type %s", type2name(bt));
6304       break;
6305   }
6306 }
6307 
// Trailing zero count computation is based on leading zero count operation as per
// following equation. All AVX3 targets support AVX512CD feature which offers
// direct vector instruction to compute leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp4 = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp4 = xtmp4 + src (i.e. src - 1)
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp4 = xtmp4 & ~src (ternary truth table 0x40 = A & B & ~C)
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  // dst = lane_width_in_bits - clz
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}
6326 
// Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp3 = 0
  vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
  // xtmp3 = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp3 = xtmp3 | src; this sets every bit at and above the lowest set bit.
  vpor(xtmp3, xtmp3, src, vec_enc);
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  // dst = lane_width_in_bits - popcount
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}
6342 
// Unsigned 32-bit division: dividend in rax, quotient left in rax; rdx is
// clobbered.
void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common path: divisor MSB clear, use the hardware unsigned divide.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  // With the divisor MSB set, the unsigned quotient can only be 0 or 1, so it
  // is computed branchlessly from the sign bit of the expression above.
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    // rax = ~(dividend - divisor) & dividend in one instruction.
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  // Isolate the sign bit as the 0/1 quotient.
  shrl(rax, 31);
  bind(done);
}
6366 
// Unsigned 32-bit remainder: dividend in rax, remainder left in rdx; rax is
// clobbered.
void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common path: hardware unsigned divide leaves the remainder in rdx.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  // Save the dividend; the quotient (0 or 1) is derived branchlessly below.
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    // rax = ~(dividend - divisor) & dividend in one instruction.
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  // rax = 0 or -1 (quotient mask); select divisor where quotient is 1.
  sarl(rax, 31);
  andl(rax, divisor);
  // remainder = dividend - quotient * divisor
  subl(rdx, rax);
  bind(done);
}
6392 
// Combined unsigned 32-bit divide and remainder: dividend in rax; on exit the
// quotient is in rax and the remainder in rdx. tmp is scratch.
void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common path: hardware unsigned divide produces quotient (rax) and
  // remainder (rdx) in one instruction.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movl(rdx, rax);   // save the dividend
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  // tmp keeps the sign-carrying value so both shift flavors can be applied.
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}
6423 
// Reverse the bit order of a 32-bit value (Integer.reverse semantics):
// dst = bit-reversed src. Uses GFNI when available, otherwise a classic
// swap-adjacent-groups sequence followed by a byte swap.
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    // The identity matrix constant makes GF2P8AFFINEQB reverse bits within
    // each byte; bswapl below then reverses the byte order.
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  // Bits are now reversed within each byte; reverse the byte order too.
  bswapl(dst);
}
6462 
// Reverse the bit order of a 64-bit value (Long.reverse semantics):
// dst = bit-reversed src. Same strategy as reverseI, with 64-bit masks
// materialized through rtmp2 since they do not fit in immediates.
void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);   // complement gives the 0xAAAA... mask
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);   // complement gives the 0xCCCC... mask
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);   // complement gives the 0xF0F0... mask
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  // Bits are now reversed within each byte; reverse the byte order too.
  bswapq(dst);
}
6507 
// Unsigned 64-bit division: dividend in rax, quotient left in rax; rdx is
// clobbered. 64-bit counterpart of udivI.
void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common path: divisor MSB clear, use the hardware unsigned divide.
  // (32-bit xor zero-extends, clearing all of rdx.)
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  // With the divisor MSB set, the unsigned quotient can only be 0 or 1.
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  // Isolate the sign bit as the 0/1 quotient.
  shrq(rax, 63);
  bind(done);
}
6531 
// Unsigned 64-bit remainder: dividend in rax, remainder left in rdx; rax is
// clobbered. 64-bit counterpart of umodI.
void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common path: hardware unsigned divide leaves the remainder in rdx.
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movq(rdx, rax);   // save the dividend
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  // rax = 0 or -1 (quotient mask); select divisor where quotient is 1.
  sarq(rax, 63);
  andq(rax, divisor);
  // remainder = dividend - quotient * divisor
  subq(rdx, rax);
  bind(done);
}
6557 
// Combined unsigned 64-bit divide and remainder: dividend in rax; on exit the
// quotient is in rax and the remainder in rdx. tmp is scratch.
void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common path: hardware unsigned divide produces quotient (rax) and
  // remainder (rdx) in one instruction.
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);   // save the dividend
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  // tmp keeps the sign-carrying value so both shift flavors can be applied.
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
6587 
// Cross-lane byte rearrange for AVX512BW: emulates a full-width byte permute
// by broadcasting each 128-bit source lane in turn and merging the lanes
// selected by the shuffle-index range test.
void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are inlane operations and indices are determined using
  // lower 4 bit of each shuffle lane, thus all shuffle indices are
  // normalized to index range 0-15. This makes sure that all the multiples
  // of an index value are placed at same relative position in 128 bit
  // lane i.e. elements corresponding to shuffle indices 16, 32 and 64
  // will be 16th element in their respective 128 bit lanes.
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
  // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
  // original shuffle indices and move the shuffled lanes corresponding to true
  // mask to destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
  // and broadcasting second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
  // xtmp2 = 32 in every byte (16 doubled via a quadword shift; no byte
  // crosses its neighbor since 16 << 1 = 32 fits in a byte).
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
  // and broadcasting third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
  // xtmp1 = 48 in every byte (16 + 32).
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
  // xtmp2 = 64 in every byte (32 doubled).
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}
6633 
6634 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6635                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6636   if (vlen_enc == AVX_128bit) {
6637     vpermilps(dst, src, shuffle, vlen_enc);
6638   } else if (bt == T_INT) {
6639     vpermd(dst, shuffle, src, vlen_enc);
6640   } else {
6641     assert(bt == T_FLOAT, "");
6642     vpermps(dst, shuffle, src, vlen_enc);
6643   }
6644 }
6645 
6646 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6647   switch(opcode) {
6648     case Op_AddHF: vaddsh(dst, src1, src2); break;
6649     case Op_SubHF: vsubsh(dst, src1, src2); break;
6650     case Op_MulHF: vmulsh(dst, src1, src2); break;
6651     case Op_DivHF: vdivsh(dst, src1, src2); break;
6652     default: assert(false, "%s", NodeClassNames[opcode]); break;
6653   }
6654 }
6655 
6656 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6657   switch(elem_bt) {
6658     case T_BYTE:
6659       if (ideal_opc == Op_SaturatingAddV) {
6660         vpaddsb(dst, src1, src2, vlen_enc);
6661       } else {
6662         assert(ideal_opc == Op_SaturatingSubV, "");
6663         vpsubsb(dst, src1, src2, vlen_enc);
6664       }
6665       break;
6666     case T_SHORT:
6667       if (ideal_opc == Op_SaturatingAddV) {
6668         vpaddsw(dst, src1, src2, vlen_enc);
6669       } else {
6670         assert(ideal_opc == Op_SaturatingSubV, "");
6671         vpsubsw(dst, src1, src2, vlen_enc);
6672       }
6673       break;
6674     default:
6675       fatal("Unsupported type %s", type2name(elem_bt));
6676       break;
6677   }
6678 }
6679 
6680 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6681   switch(elem_bt) {
6682     case T_BYTE:
6683       if (ideal_opc == Op_SaturatingAddV) {
6684         vpaddusb(dst, src1, src2, vlen_enc);
6685       } else {
6686         assert(ideal_opc == Op_SaturatingSubV, "");
6687         vpsubusb(dst, src1, src2, vlen_enc);
6688       }
6689       break;
6690     case T_SHORT:
6691       if (ideal_opc == Op_SaturatingAddV) {
6692         vpaddusw(dst, src1, src2, vlen_enc);
6693       } else {
6694         assert(ideal_opc == Op_SaturatingSubV, "");
6695         vpsubusw(dst, src1, src2, vlen_enc);
6696       }
6697       break;
6698     default:
6699       fatal("Unsupported type %s", type2name(elem_bt));
6700       break;
6701   }
6702 }
6703 
// Unsigned saturating subtraction for int/long lanes on EVEX targets.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                              XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when magnitude of second input is greater than first input.
  // overflow_mask = Inp1 <u Inp2
  // The mask actually computed below is ktmp = Inp2 <u Inp1, i.e. lanes that
  // cannot underflow (equal lanes subtract to zero either way).
  evpcmpu(elem_bt, ktmp,  src2, src1, Assembler::lt, vlen_enc);
  // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
  // Zero-masked subtract: lanes not in ktmp are cleared, which is the
  // saturated (floor) result.
  evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
}
6712 
// Unsigned saturating subtraction for int/long lanes on AVX targets, where
// no unsigned compare exists: bias both inputs by MIN_VALUE so a signed
// greater-than reproduces the unsigned ordering.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Emulate unsigned comparison using signed comparison
  // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
  // xtmp1 = MIN_VALUE broadcast; xtmp2/xtmp1 become the biased inputs.
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
  vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
  vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);

  // xtmp2 = underflow mask: biased(src2) >s biased(src1), i.e. src1 <u src2.
  vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);

  // Res = INP1 - INP2 (non-commutative and non-associative)
  vpsub(elem_bt, dst, src1, src2, vlen_enc);
  // Res = Mask ? Zero : Res
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
}
6729 
// Unsigned saturating addition for int/long lanes on EVEX targets.
// Overflowed lanes are clamped to the unsigned maximum (all ones).
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                               XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned values ranges comprise of only +ve numbers, thus there exist only an upper bound saturation.
  // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
  // (see the Hacker's Delight derivation in the comment block preceding
  // vector_add_dq_saturating_unsigned_avx)
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // T1 = SRC1 | SRC2
  vpor(xtmp1, src1, src2, vlen_enc);
  // Max_Unsigned = -1
  vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
  // Unsigned compare:  Mask = Res <u T1
  evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
  // res  = Mask ? Max_Unsigned : Res
  evpblend(elem_bt, dst, ktmp,  dst, xtmp2, true, vlen_enc);
}
6745 
6746 //
6747 // Section 2-13 Hacker's Delight list following overflow detection check for saturating
6748 // unsigned addition operation.
6749 //    overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
6750 //
6751 // We empirically determined its semantic equivalence to following reduced expression
6752 //    overflow_mask =  (a + b) <u (a | b)
6753 //
6754 // and also verified it though Alive2 solver.
6755 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6756 //
6757 
// Unsigned saturating addition for int/long lanes on AVX targets: detect
// overflow via the biased signed compare (see derivation above) and clamp
// overflowed lanes to the unsigned maximum (all ones).
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // Compute T1 = INP1 | INP2
  vpor(xtmp3, src1, src2, vlen_enc);
  // xtmp2 = MIN_VALUE (minimum signed value); xtmp1 is left holding -1
  // (all ones), reused below as the unsigned-max saturation value.
  vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  // Convert T1 to signed value, T1 = T1 + MIN_VALUE
  vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
  // Convert Res to signed value, Res<s> = Res + MIN_VALUE
  vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<s> <s T1
  if (elem_bt == T_INT) {
    vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
  } else {
    assert(elem_bt == T_LONG, "");
    vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
  }
  // Overflowed lanes take -1 (unsigned max) from xtmp1.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
6779 
// Emulation of VPMOVQ2M (quadword sign bits -> mask register) for EVEX
// targets lacking AVX512DQ. If the caller already has -1 in xtmp2, pass
// xtmp2_hold_M1 = true to skip rematerializing it.
void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovq2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    // xtmp2 = -1 (all ones) unless supplied by the caller.
    if (!xtmp2_hold_M1) {
      vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    // Broadcast each lane's sign bit across the lane, then set the mask bit
    // where the lane equals -1 (i.e. the source lane was negative).
    evpsraq(xtmp1, src, 63, vlen_enc);
    evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}
6793 
// Emulation of VPMOVD2M (dword sign bits -> mask register) for EVEX targets
// lacking AVX512DQ; dword sibling of evpmovq2m_emu above.
void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovd2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    // xtmp2 = -1 (all ones) unless supplied by the caller.
    if (!xtmp2_hold_M1) {
      vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    // Broadcast each lane's sign bit, then mask the lanes equal to -1.
    vpsrad(xtmp1, src, 31, vlen_enc);
    Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}
6807 
6808 
6809 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6810   if (elem_bt == T_LONG) {
6811     if (VM_Version::supports_evex()) {
6812       evpsraq(dst, src, 63, vlen_enc);
6813     } else {
6814       vpsrad(dst, src, 31, vlen_enc);
6815       vpshufd(dst, dst, 0xF5, vlen_enc);
6816     }
6817   } else {
6818     assert(elem_bt == T_INT, "");
6819     vpsrad(dst, src, 31, vlen_enc);
6820   }
6821 }
6822 
6823 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6824   if (compute_allones) {
6825     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6826       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6827     } else {
6828       vpcmpeqq(allones, allones, allones, vlen_enc);
6829     }
6830   }
6831   if (elem_bt == T_LONG) {
6832     vpsrlq(dst, allones, 1, vlen_enc);
6833   } else {
6834     assert(elem_bt == T_INT, "");
6835     vpsrld(dst, allones, 1, vlen_enc);
6836   }
6837 }
6838 
6839 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6840   if (compute_allones) {
6841     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6842       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6843     } else {
6844       vpcmpeqq(allones, allones, allones, vlen_enc);
6845     }
6846   }
6847   if (elem_bt == T_LONG) {
6848     vpsllq(dst, allones, 63, vlen_enc);
6849   } else {
6850     assert(elem_bt == T_INT, "");
6851     vpslld(dst, allones, 31, vlen_enc);
6852   }
6853 }
6854 
6855 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
6856                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6857   switch(elem_bt) {
6858     case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6859     case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6860     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6861     case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6862     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6863   }
6864 }
6865 
6866 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6867   switch(elem_bt) {
6868     case  T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
6869     case  T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
6870     case  T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6871     case  T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
6872     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6873   }
6874 }
6875 
6876 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6877                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6878   if (elem_bt == T_LONG) {
6879     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6880   } else {
6881     assert(elem_bt == T_INT, "");
6882     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6883   }
6884 }
6885 
// Saturating signed add/sub for 32/64-bit lanes on AVX-512 (EVEX) targets:
// compute the wrapping result, detect per-lane signed overflow, then replace
// only the overflowed lanes with the saturating MIN/MAX value selected by the
// sign of the first input. xtmp1/xtmp2 are scratch vectors; ktmp1/ktmp2 are
// scratch opmask registers; the final result lands in dst.
void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                         KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signed'ness.
  // Overflow detection based on Hacker's delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    // xtmp2 now has its sign bit set in every lane that overflowed.
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    // xtmp2 now has its sign bit set in every lane that overflowed.
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Compute overflow detection mask (sign bit of xtmp2 per lane -> ktmp1).
  evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 hold -1 in all its lanes after above call.

  // Compute mask based on first input polarity (src1 lane negative -> ktmp2 bit set).
  // Passing xtmp2_hold_M1=true reuses the -1 vector already in xtmp1.
  evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);

  // xtmp2 = per-lane MAX_VALUE, xtmp1 = per-lane MIN_VALUE (both built from the -1 vector).
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in first input polarity mask holds a min value.
  evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
  // Blend destination lanes with saturated values using overflow detection mask.
  evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
}
6928 
6929 
// Saturating signed add/sub for 32/64-bit lanes on plain AVX targets (no opmask
// registers available): compute the wrapping result, build a vector overflow
// mask by sign-extending the overflow sign bits, then use byte blends to patch
// overflowed lanes with MIN/MAX chosen from the first input's sign.
// xtmp1..xtmp4 are scratch vectors; the final result lands in dst.
void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                        XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                        XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signed'ness.
  // Overflow detection based on Hacker's delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    // xtmp2 now has its sign bit set in every lane that overflowed.
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    // xtmp2 now has its sign bit set in every lane that overflowed.
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Sign-extend to compute overflow detection mask (xtmp3 = all-ones in overflowed lanes).
  vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);

  // xtmp1 = -1 in all lanes; from it derive xtmp2 = MAX_VALUE and xtmp1 = MIN_VALUE per lane.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose saturating min/max vector using first input polarity mask.
  // xtmp4 = all-ones in lanes where src1 is negative (those saturate to MIN).
  vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
  vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);

  // Blend result with saturating vector using overflow detection mask.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
6970 
6971 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6972   switch(elem_bt) {
6973     case T_BYTE:
6974       if (ideal_opc == Op_SaturatingAddV) {
6975         vpaddsb(dst, src1, src2, vlen_enc);
6976       } else {
6977         assert(ideal_opc == Op_SaturatingSubV, "");
6978         vpsubsb(dst, src1, src2, vlen_enc);
6979       }
6980       break;
6981     case T_SHORT:
6982       if (ideal_opc == Op_SaturatingAddV) {
6983         vpaddsw(dst, src1, src2, vlen_enc);
6984       } else {
6985         assert(ideal_opc == Op_SaturatingSubV, "");
6986         vpsubsw(dst, src1, src2, vlen_enc);
6987       }
6988       break;
6989     default:
6990       fatal("Unsupported type %s", type2name(elem_bt));
6991       break;
6992   }
6993 }
6994 
6995 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6996   switch(elem_bt) {
6997     case T_BYTE:
6998       if (ideal_opc == Op_SaturatingAddV) {
6999         vpaddusb(dst, src1, src2, vlen_enc);
7000       } else {
7001         assert(ideal_opc == Op_SaturatingSubV, "");
7002         vpsubusb(dst, src1, src2, vlen_enc);
7003       }
7004       break;
7005     case T_SHORT:
7006       if (ideal_opc == Op_SaturatingAddV) {
7007         vpaddusw(dst, src1, src2, vlen_enc);
7008       } else {
7009         assert(ideal_opc == Op_SaturatingSubV, "");
7010         vpsubusw(dst, src1, src2, vlen_enc);
7011       }
7012       break;
7013     default:
7014       fatal("Unsupported type %s", type2name(elem_bt));
7015       break;
7016   }
7017 }
7018 
7019 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
7020                                                      XMMRegister src2, int vlen_enc) {
7021   switch(elem_bt) {
7022     case T_BYTE:
7023       evpermi2b(dst, src1, src2, vlen_enc);
7024       break;
7025     case T_SHORT:
7026       evpermi2w(dst, src1, src2, vlen_enc);
7027       break;
7028     case T_INT:
7029       evpermi2d(dst, src1, src2, vlen_enc);
7030       break;
7031     case T_LONG:
7032       evpermi2q(dst, src1, src2, vlen_enc);
7033       break;
7034     case T_FLOAT:
7035       evpermi2ps(dst, src1, src2, vlen_enc);
7036       break;
7037     case T_DOUBLE:
7038       evpermi2pd(dst, src1, src2, vlen_enc);
7039       break;
7040     default:
7041       fatal("Unsupported type %s", type2name(elem_bt));
7042       break;
7043   }
7044 }
7045 
7046 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
7047   if (is_unsigned) {
7048     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7049   } else {
7050     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7051   }
7052 }
7053 
7054 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
7055   if (is_unsigned) {
7056     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7057   } else {
7058     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7059   }
7060 }
7061 
7062 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
7063   switch(opcode) {
7064     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7065     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7066     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7067     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7068     default: assert(false, "%s", NodeClassNames[opcode]); break;
7069   }
7070 }
7071 
7072 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7073   switch(opcode) {
7074     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7075     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7076     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7077     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7078     default: assert(false, "%s", NodeClassNames[opcode]); break;
7079   }
7080 }
7081 
// Scalar FP16 min/max: delegates to the 128-bit vector implementation, which
// handles the Java semantics for NaN and signed-zero operands.
void C2_MacroAssembler::sminmax_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                     KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
  vminmax_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
}
7086 
7087 void C2_MacroAssembler::sminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7088                                              KRegister ktmp) {
7089   if (opcode == Op_MaxHF) {
7090     // dst = max(src1, src2)
7091     evminmaxsh(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN);
7092   } else {
7093     assert(opcode == Op_MinHF, "");
7094     // dst = min(src1, src2)
7095     evminmaxsh(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN);
7096   }
7097 }
7098 
// Vector FP16 min/max with Java semantics on pre-AVX10.2 hardware.
// VMINPH/VMAXPH alone do not implement Java's rules for NaN and +/-0.0 (they
// return the second operand for 0.0 vs -0.0 and for a single-NaN input), so the
// operands are first swapped per-lane based on a sign mask, and NaN lanes are
// patched afterwards. ktmp, xtmp1 and xtmp2 are scratch registers.
void C2_MacroAssembler::vminmax_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                     KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
    // Move sign bits of src2 to mask register.
    evpmovw2m(ktmp, src2, vlen_enc);
    // xtmp1 = src2 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make second source operand a +ve value.
    // As per instruction semantic, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned
    // dst = max(xtmp1, xtmp2)
    evmaxph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if its a NaN value,
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  } else {
    assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
    // Move sign bits of src1 to mask register.
    evpmovw2m(ktmp, src1, vlen_enc);
    // xtmp1 = src1 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src1 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make second source operand a -ve value.
    // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned.
    // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
    // or a valid floating-point value, is written to the result.
    // dst = min(xtmp1, xtmp2)
    evminph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if its a NaN value,
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  }
}
7143 
7144 void C2_MacroAssembler::vminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7145                                              KRegister ktmp, int vlen_enc) {
7146   if (opcode == Op_MaxVHF) {
7147     // dst = max(src1, src2)
7148     evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vlen_enc);
7149   } else {
7150     assert(opcode == Op_MinVHF, "");
7151     // dst = min(src1, src2)
7152     evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vlen_enc);
7153   }
7154 }
7155 
7156 void C2_MacroAssembler::vminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, Address src2,
7157                                              KRegister ktmp, int vlen_enc) {
7158   if (opcode == Op_MaxVHF) {
7159     // dst = max(src1, src2)
7160     evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vlen_enc);
7161   } else {
7162     assert(opcode == Op_MinVHF, "");
7163     // dst = min(src1, src2)
7164     evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vlen_enc);
7165   }
7166 }