1 /*
   2  * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "gc/shared/barrierSet.hpp"
  28 #include "gc/shared/barrierSetAssembler.hpp"
  29 #include "oops/methodData.hpp"
  30 #include "opto/c2_MacroAssembler.hpp"
  31 #include "opto/intrinsicnode.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/opcodes.hpp"
  34 #include "opto/subnode.hpp"
  35 #include "runtime/globals.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 #include "utilities/checkedCast.hpp"
  39 #include "utilities/globalDefinitions.hpp"
  40 #include "utilities/powerOfTwo.hpp"
  41 #include "utilities/sizes.hpp"
  42 
  43 #ifdef PRODUCT
  44 #define BLOCK_COMMENT(str) /* nothing */
  45 #define STOP(error) stop(error)
  46 #else
  47 #define BLOCK_COMMENT(str) block_comment(str)
  48 #define STOP(error) block_comment(error); stop(error)
  49 #endif
  50 
  51 // C2 compiled method's prolog code.
  52 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  53   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  54 
  55   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  56   // Remove word for return addr
  57   framesize -= wordSize;
  58   stack_bang_size -= wordSize;
  59 
  60   // Calls to C2R adapters often do not accept exceptional returns.
  61   // We require that their callers must bang for them.  But be careful, because
  62   // some VM calls (such as call site linkage) can use several kilobytes of
  63   // stack.  But the stack safety zone should account for that.
  64   // See bugs 4446381, 4468289, 4497237.
  65   if (stack_bang_size > 0) {
  66     generate_stack_overflow_check(stack_bang_size);
  67 
  68     // We always push rbp, so that on return to the interpreter rbp will be
  69     // restored correctly and we can correct the stack.
  70     push(rbp);
  71     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  72     if (PreserveFramePointer) {
  73       mov(rbp, rsp);
  74     }
  75     // Remove word for ebp
  76     framesize -= wordSize;
  77 
  78     // Create frame
  79     if (framesize) {
  80       subptr(rsp, framesize);
  81     }
  82   } else {
  83     subptr(rsp, framesize);
  84 
  85     // Save RBP register now.
  86     framesize -= wordSize;
  87     movptr(Address(rsp, framesize), rbp);
  88     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  89     if (PreserveFramePointer) {
  90       movptr(rbp, rsp);
  91       if (framesize > 0) {
  92         addptr(rbp, framesize);
  93       }
  94     }
  95   }
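       // Illustrative sketch (not emitted code) of the frame built on the
       // stack-banging path above:
       //   [ return address ]                 <- pushed by the caller's call
       //   [ saved rbp      ]                 <- push(rbp)
       //   [ framesize bytes of frame body ]  <- subptr(rsp, framesize); rsp points here
       // On the non-banging path the frame is allocated with a single subptr and
       // rbp is then stored into its topmost slot, just below the return address.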
  96 
  97   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
  98     framesize -= wordSize;
  99     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 100   }
 101 
 102 #ifdef ASSERT
 103   if (VerifyStackAtCalls) {
 104     Label L;
 105     push(rax);
 106     mov(rax, rsp);
 107     andptr(rax, StackAlignmentInBytes-1);
 108     cmpptr(rax, StackAlignmentInBytes-wordSize);
 109     pop(rax);
 110     jcc(Assembler::equal, L);
 111     STOP("Stack is not properly aligned!");
 112     bind(L);
 113   }
 114 #endif
 115 
 116   if (!is_stub) {
 117     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 118     // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 119     Label dummy_slow_path;
 120     Label dummy_continuation;
 121     Label* slow_path = &dummy_slow_path;
 122     Label* continuation = &dummy_continuation;
 123     if (!Compile::current()->output()->in_scratch_emit_size()) {
 124       // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 125       C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 126       Compile::current()->output()->add_stub(stub);
 127       slow_path = &stub->entry();
 128       continuation = &stub->continuation();
 129     }
 130     bs->nmethod_entry_barrier(this, slow_path, continuation);
 131   }
 132 }
 133 
 134 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 135   switch (vlen_in_bytes) {
 136     case  4: // fall-through
 137     case  8: // fall-through
 138     case 16: return Assembler::AVX_128bit;
 139     case 32: return Assembler::AVX_256bit;
 140     case 64: return Assembler::AVX_512bit;
 141 
 142     default: {
 143       ShouldNotReachHere();
 144       return Assembler::AVX_NoVec;
 145     }
 146   }
 147 }
 148 
 149 // fast_lock and fast_unlock used by C2
 150 
 151 // Because the transitions from emitted code to the runtime
 152 // monitorenter/exit helper stubs are so slow it's critical that
 153 // we inline both the stack-locking fast path and the inflated fast path.
 154 //
 155 // See also: cmpFastLock and cmpFastUnlock.
 156 //
 157 // What follows is a specialized inline transliteration of the code
 158 // in enter() and exit(). If we're concerned about I$ bloat another
 159 // option would be to emit TrySlowEnter and TrySlowExit methods
 160 // at startup-time.  These methods would accept arguments as
 161 // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 162 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 163 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 164 // In practice, however, the # of lock sites is bounded and is usually small.
 165 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
 166 // if the processor uses simple bimodal branch predictors keyed by EIP,
 167 // since the helper routines would be called from multiple synchronization
 168 // sites.
 169 //
 170 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 171 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 172 // to those specialized methods.  That'd give us a mostly platform-independent
 173 // implementation that the JITs could optimize and inline at their pleasure.
 174 // Done correctly, the only time we'd need to cross to native code would be
 175 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 176 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 177 // (b) issue explicit barriers or fence operations.
 178 //
 179 // TODO:
 180 //
 181 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 182 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 183 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 184 //    the lock operators would typically be faster than reifying Self.
 185 //
 186 // *  Ideally I'd define the primitives as:
 187 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 188 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 189 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 190 //    Instead, we're stuck with the rather awkward and brittle register assignments below.
 191 //    Furthermore the register assignments are overconstrained, possibly resulting in
 192 //    sub-optimal code near the synchronization site.
 193 //
 194 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 195 //    Alternately, use a better sp-proximity test.
 196 //
 197 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 198 //    Either one is sufficient to uniquely identify a thread.
 199 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 200 //
 201 // *  Intrinsify notify() and notifyAll() for the common cases where the
 202 //    object is locked by the calling thread but the waitlist is empty.
 203 //    This would avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 204 //
 205 // *  Use jccb and jmpb instead of jcc and jmp to improve code density.
 206 //    But beware of excessive branch density on AMD Opterons.
 207 //
 208 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 209 //    or failure of the fast path.  If the fast path fails then we pass
 210 //    control to the slow path, typically in C.  In fast_lock and
 211 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 212 //    will emit a conditional branch immediately after the node.
 213 //    So we have branches to branches and lots of ICC.ZF games.
 214 //    Instead, it might be better to have C2 pass a "FailureLabel"
 215 //    into fast_lock and fast_unlock.  In the case of success, control
 216 //    will drop through the node.  ICC.ZF is undefined at exit.
 217 //    In the case of failure, the node will branch directly to the
 218 //    FailureLabel.
 219 
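     // Rough C-style sketch of the fast path emitted below (illustrative only; the
     // UseObjectMonitorTable lookup and the exact failure/ZF bookkeeping are elided):
     //
     //   mark = obj->mark();
     //   if (mark has the monitor bit set)                     goto inflated;
     //   if (lock-stack is full)                               goto slow_path;
     //   if (lock-stack top == obj)                            { push obj; goto locked; }  // recursive
     //   if (!CAS(&obj->mark, mark|unlocked, mark&~unlocked))  goto slow_path;
     //   push obj; goto locked;
     // inflated:
     //   if (!CAS(&monitor->owner, null, thread->monitor_owner_id)) {
     //     if (owner != thread->monitor_owner_id)              goto slow_path;
     //     monitor->recursions++;                              // recursive
     //   }
     // locked: ZF == 1 on exit; slow_path must be reached with ZF == 0.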
 220 
 221 // obj: object to lock
 222 // box: on-stack box address -- KILLED
 223 // rax: tmp -- KILLED
 224 // t  : tmp -- KILLED
 225 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 226                                               Register t, Register thread) {
 227   assert(rax_reg == rax, "Used for CAS");
 228   assert_different_registers(obj, box, rax_reg, t, thread);
 229 
 230   // Handle inflated monitor.
 231   Label inflated;
 232   // Finish fast lock successfully. ZF value is irrelevant.
 233   Label locked;
 234   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 235   Label slow_path;
 236 
 237   if (UseObjectMonitorTable) {
 238     // Clear cache in case fast locking succeeds or we need to take the slow-path.
 239     movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
 240   }
 241 
 242   if (DiagnoseSyncOnValueBasedClasses != 0) {
 243     load_klass(rax_reg, obj, t);
 244     testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 245     jcc(Assembler::notZero, slow_path);
 246   }
 247 
 248   const Register mark = t;
 249 
 250   { // Lightweight Lock
 251 
 252     Label push;
 253 
 254     const Register top = UseObjectMonitorTable ? rax_reg : box;
 255 
 256     // Load the mark.
 257     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 258 
 259     // Prefetch top.
 260     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 261 
 262     // Check for monitor (0b10).
 263     testptr(mark, markWord::monitor_value);
 264     jcc(Assembler::notZero, inflated);
 265 
 266     // Check if lock-stack is full.
 267     cmpl(top, LockStack::end_offset() - 1);
 268     jcc(Assembler::greater, slow_path);
 269 
 270     // Check if recursive.
 271     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 272     jccb(Assembler::equal, push);
 273 
 274     // Try to lock. Transition lock bits 0b01 => 0b00
 275     movptr(rax_reg, mark);
 276     orptr(rax_reg, markWord::unlocked_value);
 277     andptr(mark, ~(int32_t)markWord::unlocked_value);
 278     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 279     jcc(Assembler::notEqual, slow_path);
 280 
 281     if (UseObjectMonitorTable) {
 282       // Need to reload top, clobbered by CAS.
 283       movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 284     }
 285     bind(push);
 286     // After successful lock, push object on lock-stack.
 287     movptr(Address(thread, top), obj);
 288     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 289     jmpb(locked);
 290   }
 291 
 292   { // Handle inflated monitor.
 293     bind(inflated);
 294 
 295     const Register monitor = t;
 296 
 297     if (!UseObjectMonitorTable) {
 298       assert(mark == monitor, "should be the same here");
 299     } else {
 300       // Uses ObjectMonitorTable.  Look for the monitor in the om_cache.
 301       // Fetch ObjectMonitor* from the cache or take the slow-path.
 302       Label monitor_found;
 303 
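           // The lookup emitted below is, roughly (illustrative sketch):
           //   for (oop* p = thread's om_cache oops; ; p += entry stride) {
           //     if (*p == obj)  { monitor = the matching monitor slot; break; }
           //     if (*p == null) goto slow_path;   // cache ends with a null sentinel
           //   }
           // with the first num_unrolled probes peeled off before the loop.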
 304       // Load cache address
 305       lea(t, Address(thread, JavaThread::om_cache_oops_offset()));
 306 
 307       const int num_unrolled = 2;
 308       for (int i = 0; i < num_unrolled; i++) {
 309         cmpptr(obj, Address(t));
 310         jccb(Assembler::equal, monitor_found);
 311         increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 312       }
 313 
 314       Label loop;
 315 
 316       // Search for obj in cache.
 317       bind(loop);
 318 
 319       // Check for match.
 320       cmpptr(obj, Address(t));
 321       jccb(Assembler::equal, monitor_found);
 322 
 323       // Search until null encountered, guaranteed _null_sentinel at end.
 324       cmpptr(Address(t), 1);
 325       jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
 326       increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 327       jmpb(loop);
 328 
 329       // Cache hit.
 330       bind(monitor_found);
 331       movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
 332     }
 333     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 334     const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 335     const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);
 336 
 337     Label monitor_locked;
 338     // Lock the monitor.
 339 
 340     if (UseObjectMonitorTable) {
 341       // Cache the monitor for unlock before trashing box. On failure to acquire
 342       // the lock, the slow path will reset the entry accordingly (see CacheSetter).
 343       movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
 344     }
 345 
 346     // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 347     xorptr(rax_reg, rax_reg);
 348     movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
 349     lock(); cmpxchgptr(box, owner_address);
 350     jccb(Assembler::equal, monitor_locked);
 351 
 352     // Check if recursive.
 353     cmpptr(box, rax_reg);
 354     jccb(Assembler::notEqual, slow_path);
 355 
 356     // Recursive.
 357     increment(recursions_address);
 358 
 359     bind(monitor_locked);
 360   }
 361 
 362   bind(locked);
 363   // Set ZF = 1
 364   xorl(rax_reg, rax_reg);
 365 
 366 #ifdef ASSERT
 367   // Check that locked label is reached with ZF set.
 368   Label zf_correct;
 369   Label zf_bad_zero;
 370   jcc(Assembler::zero, zf_correct);
 371   jmp(zf_bad_zero);
 372 #endif
 373 
 374   bind(slow_path);
 375 #ifdef ASSERT
 376   // Check that slow_path label is reached with ZF not set.
 377   jcc(Assembler::notZero, zf_correct);
 378   stop("Fast Lock ZF != 0");
 379   bind(zf_bad_zero);
 380   stop("Fast Lock ZF != 1");
 381   bind(zf_correct);
 382 #endif
 383   // C2 uses the value of ZF to determine the continuation.
 384 }
 385 
 386 // obj: object to unlock
 387 // rax: tmp -- KILLED
 388 // t  : tmp - cannot be obj nor rax -- KILLED
 389 //
 390 // Some commentary on balanced locking:
 391 //
 392 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 393 // Methods that don't have provably balanced locking are forced to run in the
 394 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 395 // The interpreter provides two properties:
 396 // I1:  At return-time the interpreter automatically and quietly unlocks any
 397 //      objects acquired in the current activation (frame).  Recall that the
 398 //      interpreter maintains an on-stack list of locks currently held by
 399 //      a frame.
 400 // I2:  If a method attempts to unlock an object that is not held by the
 401 //      frame, the interpreter throws IMSX (IllegalMonitorStateException).
 402 //
 403 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
 404 // B() doesn't have provably balanced locking so it runs in the interpreter.
 405 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 406 // is still locked by A().
 407 //
 408 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface
 409 // Specification" states that an object locked by JNI's MonitorEnter should not be
 410 // unlocked by "normal" Java-level locking and vice-versa.  The specification doesn't
 411 // specify what will occur if a program engages in such mixed-mode locking, however.
 412 // Arguably, given that the spec legislates the JNI case as undefined, our implementation
 413 // could reasonably *avoid* checking the owner in fast_unlock().
 414 // In the interest of performance we elide the m->Owner==Self check in unlock.
 415 // A perfectly viable alternative is to elide the owner check except when
 416 // Xcheck:jni is enabled.
 417 
 418 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
 419   assert(reg_rax == rax, "Used for CAS");
 420   assert_different_registers(obj, reg_rax, t);
 421 
 422   // Handle inflated monitor.
 423   Label inflated, inflated_check_lock_stack;
 424   // Finish fast unlock successfully.  MUST jump with ZF == 1
 425   Label unlocked, slow_path;
 426 
 427   const Register mark = t;
 428   const Register monitor = t;
 429   const Register top = UseObjectMonitorTable ? t : reg_rax;
 430   const Register box = reg_rax;
 431 
 432   Label dummy;
 433   C2FastUnlockLightweightStub* stub = nullptr;
 434 
 435   if (!Compile::current()->output()->in_scratch_emit_size()) {
 436     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
 437     Compile::current()->output()->add_stub(stub);
 438   }
 439 
 440   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
 441 
 442   { // Lightweight Unlock
 443 
 444     // Load top.
 445     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 446 
 447     if (!UseObjectMonitorTable) {
 448       // Prefetch mark.
 449       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 450     }
 451 
 452     // Check if obj is top of lock-stack.
 453     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 454     // Top of lock stack was not obj. Must be monitor.
 455     jcc(Assembler::notEqual, inflated_check_lock_stack);
 456 
 457     // Pop lock-stack.
 458     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
 459     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 460 
 461     // Check if recursive.
 462     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
 463     jcc(Assembler::equal, unlocked);
 464 
 465     // We elide the monitor check and let the CAS fail instead.
 466 
 467     if (UseObjectMonitorTable) {
 468       // Load mark.
 469       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 470     }
 471 
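         // Illustrative sketch of the CAS below (the low two mark bits are the lock bits):
         //   expected = mark & ~lock_mask;        // currently fast-locked: ...00
         //   desired  = mark |  unlocked_value;   // unlocked:              ...01
         //   if (!CAS(&obj->mark, expected, desired)) goto push_and_slow_path;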
 472     // Try to unlock. Transition lock bits 0b00 => 0b01
 473     movptr(reg_rax, mark);
 474     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
 475     orptr(mark, markWord::unlocked_value);
 476     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 477     jcc(Assembler::notEqual, push_and_slow_path);
 478     jmp(unlocked);
 479   }
 480 
 481 
 482   { // Handle inflated monitor.
 483     bind(inflated_check_lock_stack);
 484 #ifdef ASSERT
 485     Label check_done;
 486     subl(top, oopSize);
 487     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
 488     jcc(Assembler::below, check_done);
 489     cmpptr(obj, Address(thread, top));
 490     jccb(Assembler::notEqual, inflated_check_lock_stack);
 491     stop("Fast Unlock lock on stack");
 492     bind(check_done);
 493     if (UseObjectMonitorTable) {
 494       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 495     }
 496     testptr(mark, markWord::monitor_value);
 497     jccb(Assembler::notZero, inflated);
 498     stop("Fast Unlock not monitor");
 499 #endif
 500 
 501     bind(inflated);
 502 
 503     if (!UseObjectMonitorTable) {
 504       assert(mark == monitor, "should be the same here");
 505     } else {
 506       // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
 507       movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 508       // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
 509       cmpptr(monitor, alignof(ObjectMonitor*));
 510       jcc(Assembler::below, slow_path);
 511     }
 512     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 513     const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
 514     const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
 515     const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
 516     const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
 517 
 518     Label recursive;
 519 
 520     // Check if recursive.
 521     cmpptr(recursions_address, 0);
 522     jccb(Assembler::notZero, recursive);
 523 
 524     // Set owner to null.
 525     // Release to satisfy the JMM
 526     movptr(owner_address, NULL_WORD);
 527     // We need a full fence after clearing owner to avoid stranding.
 528     // StoreLoad achieves this.
 529     membar(StoreLoad);
 530 
 531     // Check if the entry_list is empty.
 532     cmpptr(entry_list_address, NULL_WORD);
 533     jccb(Assembler::zero, unlocked);    // If so we are done.
 534 
 535     // Check if there is a successor.
 536     cmpptr(succ_address, NULL_WORD);
 537     jccb(Assembler::notZero, unlocked); // If so we are done.
 538 
 539     // Save the monitor pointer in the current thread, so we can try to
 540     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 541     if (!UseObjectMonitorTable) {
 542       andptr(monitor, ~(int32_t)markWord::monitor_value);
 543     }
 544     movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);
 545 
 546     orl(t, 1); // Fast Unlock ZF = 0
 547     jmpb(slow_path);
 548 
 549     // Recursive unlock.
 550     bind(recursive);
 551     decrement(recursions_address);
 552   }
 553 
 554   bind(unlocked);
 555   xorl(t, t); // Fast Unlock ZF = 1
 556 
 557 #ifdef ASSERT
 558   // Check that unlocked label is reached with ZF set.
 559   Label zf_correct;
 560   Label zf_bad_zero;
 561   jcc(Assembler::zero, zf_correct);
 562   jmp(zf_bad_zero);
 563 #endif
 564 
 565   bind(slow_path);
 566   if (stub != nullptr) {
 567     bind(stub->slow_path_continuation());
 568   }
 569 #ifdef ASSERT
 570   // Check that stub->continuation() label is reached with ZF not set.
 571   jcc(Assembler::notZero, zf_correct);
 572   stop("Fast Unlock ZF != 0");
 573   bind(zf_bad_zero);
 574   stop("Fast Unlock ZF != 1");
 575   bind(zf_correct);
 576 #endif
 577   // C2 uses the value of ZF to determine the continuation.
 578 }
 579 
 580 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
 581   fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
 582 }
 583 
 584 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
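       // The prologue (see verified_entry above) leaves rbp pointing at the saved
       // rbp slot, two words below the top of the frame; frame_size_in_bytes()
       // includes those two slots (return address and saved rbp), hence
       // dst = rsp + framesize - 2 * wordSize reconstructs that address.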
 585   const int framesize = Compile::current()->output()->frame_size_in_bytes();
 586   masm->movptr(dst, rsp);
 587   if (framesize > 2 * wordSize) {
 588     masm->addptr(dst, framesize - 2 * wordSize);
 589   }
 590 }
 591 
 592 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
 593   if (PreserveFramePointer) {
 594     // frame pointer is valid
 595 #ifdef ASSERT
 596     // Verify frame pointer value in rbp.
 597     reconstruct_frame_pointer_helper(this, rtmp);
 598     Label L_success;
 599     cmpq(rbp, rtmp);
 600     jccb(Assembler::equal, L_success);
 601     STOP("frame pointer mismatch");
 602     bind(L_success);
 603 #endif // ASSERT
 604   } else {
 605     reconstruct_frame_pointer_helper(this, rbp);
 606   }
 607 }
 608 
 609 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
 610   jint lo = t->_lo;
 611   jint hi = t->_hi;
 612   assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
 613   if (t == TypeInt::INT) {
 614     return;
 615   }
 616 
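       // The emitted check below is equivalent to (illustrative):
       //   if (!(lo <= val && val <= hi)) abort_verify_int_in_range(idx, val, lo, hi);
       // with a bound comparison elided when lo == min_jint or hi == max_jint.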
 617   BLOCK_COMMENT("CastII {");
 618   Label fail;
 619   Label succeed;
 620   if (hi == max_jint) {
 621     cmpl(val, lo);
 622     jccb(Assembler::greaterEqual, succeed);
 623   } else {
 624     if (lo != min_jint) {
 625       cmpl(val, lo);
 626       jccb(Assembler::less, fail);
 627     }
 628     cmpl(val, hi);
 629     jccb(Assembler::lessEqual, succeed);
 630   }
 631 
 632   bind(fail);
 633   movl(c_rarg0, idx);
 634   movl(c_rarg1, val);
 635   movl(c_rarg2, lo);
 636   movl(c_rarg3, hi);
 637   reconstruct_frame_pointer(rscratch1);
 638   call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
 639   hlt();
 640   bind(succeed);
 641   BLOCK_COMMENT("} // CastII");
 642 }
 643 
 644 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
 645   fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
 646 }
 647 
 648 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
 649   jlong lo = t->_lo;
 650   jlong hi = t->_hi;
 651   assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
 652   if (t == TypeLong::LONG) {
 653     return;
 654   }
 655 
 656   BLOCK_COMMENT("CastLL {");
 657   Label fail;
 658   Label succeed;
 659 
 660   auto cmp_val = [&](jlong bound) {
 661     if (is_simm32(bound)) {
 662       cmpq(val, checked_cast<int>(bound));
 663     } else {
 664       mov64(tmp, bound);
 665       cmpq(val, tmp);
 666     }
 667   };
 668 
 669   if (hi == max_jlong) {
 670     cmp_val(lo);
 671     jccb(Assembler::greaterEqual, succeed);
 672   } else {
 673     if (lo != min_jlong) {
 674       cmp_val(lo);
 675       jccb(Assembler::less, fail);
 676     }
 677     cmp_val(hi);
 678     jccb(Assembler::lessEqual, succeed);
 679   }
 680 
 681   bind(fail);
 682   movl(c_rarg0, idx);
 683   movq(c_rarg1, val);
 684   mov64(c_rarg2, lo);
 685   mov64(c_rarg3, hi);
 686   reconstruct_frame_pointer(rscratch1);
 687   call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
 688   hlt();
 689   bind(succeed);
 690   BLOCK_COMMENT("} // CastLL");
 691 }
 692 
 693 //-------------------------------------------------------------------------------------------
 694 // Generic instructions support for use in .ad files C2 code generation
 695 
 696 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 697   if (dst != src) {
 698     movdqu(dst, src);
 699   }
 700   if (opcode == Op_AbsVD) {
 701     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 702   } else {
 703     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 704     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 705   }
 706 }
 707 
 708 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 709   if (opcode == Op_AbsVD) {
 710     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 711   } else {
 712     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 713     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 714   }
 715 }
 716 
 717 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 718   if (dst != src) {
 719     movdqu(dst, src);
 720   }
 721   if (opcode == Op_AbsVF) {
 722     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 723   } else {
 724     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 725     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 726   }
 727 }
 728 
 729 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 730   if (opcode == Op_AbsVF) {
 731     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 732   } else {
 733     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 734     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 735   }
 736 }
 737 
 738 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 739   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 740   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 741 
 742   if (opcode == Op_MinV) {
 743     if (elem_bt == T_BYTE) {
 744       pminsb(dst, src);
 745     } else if (elem_bt == T_SHORT) {
 746       pminsw(dst, src);
 747     } else if (elem_bt == T_INT) {
 748       pminsd(dst, src);
 749     } else {
 750       assert(elem_bt == T_LONG, "required");
 751       assert(tmp == xmm0, "required");
 752       assert_different_registers(dst, src, tmp);
 753       movdqu(xmm0, dst);
 754       pcmpgtq(xmm0, src);
 755       blendvpd(dst, src);  // xmm0 as mask
 756     }
 757   } else { // opcode == Op_MaxV
 758     if (elem_bt == T_BYTE) {
 759       pmaxsb(dst, src);
 760     } else if (elem_bt == T_SHORT) {
 761       pmaxsw(dst, src);
 762     } else if (elem_bt == T_INT) {
 763       pmaxsd(dst, src);
 764     } else {
 765       assert(elem_bt == T_LONG, "required");
 766       assert(tmp == xmm0, "required");
 767       assert_different_registers(dst, src, tmp);
 768       movdqu(xmm0, src);
 769       pcmpgtq(xmm0, dst);
 770       blendvpd(dst, src);  // xmm0 as mask
 771     }
 772   }
 773 }
 774 
 775 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 776                                   XMMRegister src1, Address src2, int vlen_enc) {
 777   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 778   if (opcode == Op_UMinV) {
 779     switch(elem_bt) {
 780       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 781       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 782       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 783       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 784       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 785     }
 786   } else {
 787     assert(opcode == Op_UMaxV, "required");
 788     switch(elem_bt) {
 789       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 790       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 791       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 792       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 793       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 794     }
 795   }
 796 }
 797 
 798 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
 799   // On AVX-512 targets without AVX512VL, evpminuq/evpmaxuq are only available
 800   // at 512-bit width, so widen to the full 512-bit vector even for smaller vector sizes.
 801   if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 802     if (opcode == Op_UMaxV) {
 803       evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 804     } else {
 805       assert(opcode == Op_UMinV, "required");
 806       evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 807     }
 808   } else {
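         // AVX2 has no unsigned 64-bit compare, so bias both operands by
         // 0x8000000000000000 and use the signed vpcmpgtq:
         //   (a <u b)  <==>  ((int64_t)(a ^ (1ULL << 63)) < (int64_t)(b ^ (1ULL << 63)))
         // Adding 1 << 63 (the vpaddq below) flips the sign bit just like the xor.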
 809     // T1 = -1
 810     vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
 811     // T1 = -1 << 63
 812     vpsllq(xtmp1, xtmp1, 63, vlen_enc);
 813     // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
 814     vpaddq(xtmp2, xtmp1, src2, vlen_enc);
 815     // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
 816     vpaddq(xtmp1, xtmp1, src1, vlen_enc);
 817     // Mask = T2 > T1
 818     vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
 819     if (opcode == Op_UMaxV) {
 820       // Res = Mask ? Src2 : Src1
 821       vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
 822     } else {
 823       // Res = Mask ? Src1 : Src2
 824       vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
 825     }
 826   }
 827 }
 828 
 829 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 830                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
 831   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 832   if (opcode == Op_UMinV) {
 833     switch(elem_bt) {
 834       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 835       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 836       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 837       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 838       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 839     }
 840   } else {
 841     assert(opcode == Op_UMaxV, "required");
 842     switch(elem_bt) {
 843       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 844       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 845       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 846       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 847       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 848     }
 849   }
 850 }
 851 
 852 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 853                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 854                                  int vlen_enc) {
 855   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 856 
 857   if (opcode == Op_MinV) {
 858     if (elem_bt == T_BYTE) {
 859       vpminsb(dst, src1, src2, vlen_enc);
 860     } else if (elem_bt == T_SHORT) {
 861       vpminsw(dst, src1, src2, vlen_enc);
 862     } else if (elem_bt == T_INT) {
 863       vpminsd(dst, src1, src2, vlen_enc);
 864     } else {
 865       assert(elem_bt == T_LONG, "required");
 866       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 867         vpminsq(dst, src1, src2, vlen_enc);
 868       } else {
 869         assert_different_registers(dst, src1, src2);
 870         vpcmpgtq(dst, src1, src2, vlen_enc);
 871         vblendvpd(dst, src1, src2, dst, vlen_enc);
 872       }
 873     }
 874   } else { // opcode == Op_MaxV
 875     if (elem_bt == T_BYTE) {
 876       vpmaxsb(dst, src1, src2, vlen_enc);
 877     } else if (elem_bt == T_SHORT) {
 878       vpmaxsw(dst, src1, src2, vlen_enc);
 879     } else if (elem_bt == T_INT) {
 880       vpmaxsd(dst, src1, src2, vlen_enc);
 881     } else {
 882       assert(elem_bt == T_LONG, "required");
 883       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 884         vpmaxsq(dst, src1, src2, vlen_enc);
 885       } else {
 886         assert_different_registers(dst, src1, src2);
 887         vpcmpgtq(dst, src1, src2, vlen_enc);
 888         vblendvpd(dst, src2, src1, dst, vlen_enc);
 889       }
 890     }
 891   }
 892 }
 893 
 894 // Float/Double min max
 895 
 896 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
 897                                    XMMRegister dst, XMMRegister a, XMMRegister b,
 898                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
 899                                    int vlen_enc) {
 900   assert(UseAVX > 0, "required");
 901   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
 902          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
 903   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
 904   assert_different_registers(a, tmp, atmp, btmp);
 905   assert_different_registers(b, tmp, atmp, btmp);
 906 
 907   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
 908   bool is_double_word = is_double_word_type(elem_bt);
 909 
 910   /* Note on 'non-obvious' assembly sequence:
 911    *
 912    * While there are vminps/vmaxps instructions, there are two important differences between hardware
 913    * and Java on how they handle floats:
 914    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
 915    *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
 916    *
 917    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
 918    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
 919    *                (only useful when signs differ, noop otherwise)
 920    *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
 921    *
 922    *  The following pseudo code describes the algorithm for max[FD] (the min algorithm is along similar lines):
 923    *   btmp = (b < +0.0) ? a : b
 924    *   atmp = (b < +0.0) ? b : a
 925    *   Tmp  = Max_Float(atmp , btmp)
 926    *   Res  = (atmp == NaN) ? atmp : Tmp
 927    */
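       // Scalar reference for the Java semantics being matched (illustrative sketch,
       // not emitted code; is_neg_zero() stands for "sign bit set"):
       //   double java_max(double a, double b) {
       //     if (a != a) return a;                                    // NaN propagates
       //     if (a == 0.0 && b == 0.0) return is_neg_zero(a) ? b : a; // +0.0 beats -0.0
       //     return (a > b) ? a : b;                                  // returns b if b is NaN
       //   }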
 928 
 929   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
 930   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
 931   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
 932   XMMRegister mask;
 933 
 934   if (!is_double_word && is_min) {
 935     mask = a;
 936     vblend = &MacroAssembler::vblendvps;
 937     vmaxmin = &MacroAssembler::vminps;
 938     vcmp = &MacroAssembler::vcmpps;
 939   } else if (!is_double_word && !is_min) {
 940     mask = b;
 941     vblend = &MacroAssembler::vblendvps;
 942     vmaxmin = &MacroAssembler::vmaxps;
 943     vcmp = &MacroAssembler::vcmpps;
 944   } else if (is_double_word && is_min) {
 945     mask = a;
 946     vblend = &MacroAssembler::vblendvpd;
 947     vmaxmin = &MacroAssembler::vminpd;
 948     vcmp = &MacroAssembler::vcmppd;
 949   } else {
 950     assert(is_double_word && !is_min, "sanity");
 951     mask = b;
 952     vblend = &MacroAssembler::vblendvpd;
 953     vmaxmin = &MacroAssembler::vmaxpd;
 954     vcmp = &MacroAssembler::vcmppd;
 955   }
 956 
 957   // Pick the temporaries so that a dst/btmp register overlap does not defeat the EnableX86ECoreOpts sequence below
 958   XMMRegister maxmin, scratch;
 959   if (dst == btmp) {
 960     maxmin = btmp;
 961     scratch = tmp;
 962   } else {
 963     maxmin = tmp;
 964     scratch = btmp;
 965   }
 966 
 967   bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
 968   if (precompute_mask && !is_double_word) {
 969     vpsrad(tmp, mask, 32, vlen_enc);
 970     mask = tmp;
 971   } else if (precompute_mask && is_double_word) {
 972     vpxor(tmp, tmp, tmp, vlen_enc);
 973     vpcmpgtq(tmp, tmp, mask, vlen_enc);
 974     mask = tmp;
 975   }
 976 
 977   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
 978   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
 979   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
 980   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 981   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
 982 }
 983 
 984 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
 985                                     XMMRegister dst, XMMRegister a, XMMRegister b,
 986                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
 987                                     int vlen_enc) {
 988   assert(UseAVX > 2, "required");
 989   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
 990          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
 991   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
 992   assert_different_registers(dst, a, atmp, btmp);
 993   assert_different_registers(dst, b, atmp, btmp);
 994 
 995   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
 996   bool is_double_word = is_double_word_type(elem_bt);
 997   bool merge = true;
 998 
 999   if (!is_double_word && is_min) {
1000     evpmovd2m(ktmp, a, vlen_enc);
1001     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1002     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1003     vminps(dst, atmp, btmp, vlen_enc);
1004     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1005     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1006   } else if (!is_double_word && !is_min) {
1007     evpmovd2m(ktmp, b, vlen_enc);
1008     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1009     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1010     vmaxps(dst, atmp, btmp, vlen_enc);
1011     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1012     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1013   } else if (is_double_word && is_min) {
1014     evpmovq2m(ktmp, a, vlen_enc);
1015     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1016     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1017     vminpd(dst, atmp, btmp, vlen_enc);
1018     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1019     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1020   } else {
1021     assert(is_double_word && !is_min, "sanity");
1022     evpmovq2m(ktmp, b, vlen_enc);
1023     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1024     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1025     vmaxpd(dst, atmp, btmp, vlen_enc);
1026     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1027     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1028   }
1029 }
1030 
1031 void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1032                                    XMMRegister src1, XMMRegister src2, int vlen_enc) {
1033   assert(opc == Op_MinV || opc == Op_MinReductionV ||
1034          opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");
1035 
1036   int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_MINMAX_MIN_COMPARE_SIGN
1037                                                          : AVX10_MINMAX_MAX_COMPARE_SIGN;
1038   if (elem_bt == T_FLOAT) {
1039     evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
1040   } else {
1041     assert(elem_bt == T_DOUBLE, "");
1042     evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
1043   }
1044 }
1045 
1046 // Float/Double signum
1047 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1048   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1049 
1050   Label DONE_LABEL;
1051 
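       // Scalar equivalent of the branches below (illustrative):
       //   if (x == 0.0 || isnan(x)) return x;
       //   return (x > 0.0) ? 1.0 : -1.0;
       // movflt/movdbl do not modify EFLAGS, so the later jcc(above) still tests the
       // result of the initial ucomiss/ucomisd.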
1052   if (opcode == Op_SignumF) {
1053     ucomiss(dst, zero);
1054     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1055     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1056     movflt(dst, one);
1057     jcc(Assembler::above, DONE_LABEL);
1058     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1059   } else if (opcode == Op_SignumD) {
1060     ucomisd(dst, zero);
1061     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1062     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1063     movdbl(dst, one);
1064     jcc(Assembler::above, DONE_LABEL);
1065     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1066   }
1067 
1068   bind(DONE_LABEL);
1069 }
1070 
1071 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1072   if (sign) {
1073     pmovsxbw(dst, src);
1074   } else {
1075     pmovzxbw(dst, src);
1076   }
1077 }
1078 
1079 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1080   if (sign) {
1081     vpmovsxbw(dst, src, vector_len);
1082   } else {
1083     vpmovzxbw(dst, src, vector_len);
1084   }
1085 }
1086 
1087 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1088   if (sign) {
1089     vpmovsxbd(dst, src, vector_len);
1090   } else {
1091     vpmovzxbd(dst, src, vector_len);
1092   }
1093 }
1094 
1095 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1096   if (sign) {
1097     vpmovsxwd(dst, src, vector_len);
1098   } else {
1099     vpmovzxwd(dst, src, vector_len);
1100   }
1101 }
1102 
1103 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1104                                      int shift, int vector_len) {
1105   if (opcode == Op_RotateLeftV) {
1106     if (etype == T_INT) {
1107       evprold(dst, src, shift, vector_len);
1108     } else {
1109       assert(etype == T_LONG, "expected type T_LONG");
1110       evprolq(dst, src, shift, vector_len);
1111     }
1112   } else {
1113     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1114     if (etype == T_INT) {
1115       evprord(dst, src, shift, vector_len);
1116     } else {
1117       assert(etype == T_LONG, "expected type T_LONG");
1118       evprorq(dst, src, shift, vector_len);
1119     }
1120   }
1121 }
1122 
1123 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1124                                      XMMRegister shift, int vector_len) {
1125   if (opcode == Op_RotateLeftV) {
1126     if (etype == T_INT) {
1127       evprolvd(dst, src, shift, vector_len);
1128     } else {
1129       assert(etype == T_LONG, "expected type T_LONG");
1130       evprolvq(dst, src, shift, vector_len);
1131     }
1132   } else {
1133     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1134     if (etype == T_INT) {
1135       evprorvd(dst, src, shift, vector_len);
1136     } else {
1137       assert(etype == T_LONG, "expected type T_LONG");
1138       evprorvq(dst, src, shift, vector_len);
1139     }
1140   }
1141 }
1142 
1143 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1144   if (opcode == Op_RShiftVI) {
1145     psrad(dst, shift);
1146   } else if (opcode == Op_LShiftVI) {
1147     pslld(dst, shift);
1148   } else {
1149     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1150     psrld(dst, shift);
1151   }
1152 }
1153 
1154 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1155   switch (opcode) {
1156     case Op_RShiftVI:  psrad(dst, shift); break;
1157     case Op_LShiftVI:  pslld(dst, shift); break;
1158     case Op_URShiftVI: psrld(dst, shift); break;
1159 
1160     default: assert(false, "%s", NodeClassNames[opcode]);
1161   }
1162 }
1163 
1164 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1165   if (opcode == Op_RShiftVI) {
1166     vpsrad(dst, nds, shift, vector_len);
1167   } else if (opcode == Op_LShiftVI) {
1168     vpslld(dst, nds, shift, vector_len);
1169   } else {
1170     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1171     vpsrld(dst, nds, shift, vector_len);
1172   }
1173 }
1174 
1175 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1176   switch (opcode) {
1177     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1178     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1179     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1180 
1181     default: assert(false, "%s", NodeClassNames[opcode]);
1182   }
1183 }
1184 
1185 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1186   switch (opcode) {
1187     case Op_RShiftVB:  // fall-through
1188     case Op_RShiftVS:  psraw(dst, shift); break;
1189 
1190     case Op_LShiftVB:  // fall-through
1191     case Op_LShiftVS:  psllw(dst, shift);   break;
1192 
1193     case Op_URShiftVS: // fall-through
1194     case Op_URShiftVB: psrlw(dst, shift);  break;
1195 
1196     default: assert(false, "%s", NodeClassNames[opcode]);
1197   }
1198 }
1199 
1200 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1201   switch (opcode) {
1202     case Op_RShiftVB:  // fall-through
1203     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1204 
1205     case Op_LShiftVB:  // fall-through
1206     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1207 
1208     case Op_URShiftVS: // fall-through
1209     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1210 
1211     default: assert(false, "%s", NodeClassNames[opcode]);
1212   }
1213 }
1214 
1215 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1216   switch (opcode) {
1217     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-AVX512 systems
1218     case Op_LShiftVL:  psllq(dst, shift); break;
1219     case Op_URShiftVL: psrlq(dst, shift); break;
1220 
1221     default: assert(false, "%s", NodeClassNames[opcode]);
1222   }
1223 }
1224 
1225 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1226   if (opcode == Op_RShiftVL) {
1227     psrlq(dst, shift);  // using srl to implement sra on pre-AVX512 systems
1228   } else if (opcode == Op_LShiftVL) {
1229     psllq(dst, shift);
1230   } else {
1231     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1232     psrlq(dst, shift);
1233   }
1234 }
1235 
1236 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1237   switch (opcode) {
1238     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1239     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1240     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1241 
1242     default: assert(false, "%s", NodeClassNames[opcode]);
1243   }
1244 }
1245 
1246 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1247   if (opcode == Op_RShiftVL) {
1248     evpsraq(dst, nds, shift, vector_len);
1249   } else if (opcode == Op_LShiftVL) {
1250     vpsllq(dst, nds, shift, vector_len);
1251   } else {
1252     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1253     vpsrlq(dst, nds, shift, vector_len);
1254   }
1255 }
1256 
1257 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1258   switch (opcode) {
1259     case Op_RShiftVB:  // fall-through
1260     case Op_RShiftVS:  // fall-through
1261     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1262 
1263     case Op_LShiftVB:  // fall-through
1264     case Op_LShiftVS:  // fall-through
1265     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1266 
1267     case Op_URShiftVB: // fall-through
1268     case Op_URShiftVS: // fall-through
1269     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1270 
1271     default: assert(false, "%s", NodeClassNames[opcode]);
1272   }
1273 }
1274 
1275 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1276   switch (opcode) {
1277     case Op_RShiftVB:  // fall-through
1278     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1279 
1280     case Op_LShiftVB:  // fall-through
1281     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1282 
1283     case Op_URShiftVB: // fall-through
1284     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1285 
1286     default: assert(false, "%s", NodeClassNames[opcode]);
1287   }
1288 }
1289 
1290 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1291   assert(UseAVX >= 2, "required");
1292   switch (opcode) {
1293     case Op_RShiftVL: {
1294       if (UseAVX > 2) {
1295         assert(tmp == xnoreg, "not used");
1296         if (!VM_Version::supports_avx512vl()) {
1297           vlen_enc = Assembler::AVX_512bit;
1298         }
1299         evpsravq(dst, src, shift, vlen_enc);
1300       } else {
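             // AVX2 has no variable 64-bit arithmetic right shift; emulate it with the
             // sign-mask trick: with m = 0x8000000000000000 in each lane,
             //   sra(x, s) == (srl(x, s) ^ srl(m, s)) - srl(m, s)
             // (the xor/sub pair sign-extends the bits that srl shifted in as zeros).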
1301         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1302         vpsrlvq(dst, src, shift, vlen_enc);
1303         vpsrlvq(tmp, tmp, shift, vlen_enc);
1304         vpxor(dst, dst, tmp, vlen_enc);
1305         vpsubq(dst, dst, tmp, vlen_enc);
1306       }
1307       break;
1308     }
1309     case Op_LShiftVL: {
1310       assert(tmp == xnoreg, "not used");
1311       vpsllvq(dst, src, shift, vlen_enc);
1312       break;
1313     }
1314     case Op_URShiftVL: {
1315       assert(tmp == xnoreg, "not used");
1316       vpsrlvq(dst, src, shift, vlen_enc);
1317       break;
1318     }
1319     default: assert(false, "%s", NodeClassNames[opcode]);
1320   }
1321 }
1322 
1323 // Variable shift src by shift using vtmp as a TEMP, giving word result in dst
1324 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1325   assert(opcode == Op_LShiftVB ||
1326          opcode == Op_RShiftVB ||
1327          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1328   bool sign = (opcode != Op_URShiftVB);
1329   assert(vector_len == 0, "required");
1330   vextendbd(sign, dst, src, 1);
1331   vpmovzxbd(vtmp, shift, 1);
1332   varshiftd(opcode, dst, dst, vtmp, 1);
1333   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1334   vextracti128_high(vtmp, dst);
1335   vpackusdw(dst, dst, vtmp, 0);
1336 }
1337 
1338 // Variable shift src by shift using vtmp as a TEMP, giving byte result in dst
1339 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1340   assert(opcode == Op_LShiftVB ||
1341          opcode == Op_RShiftVB ||
1342          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1343   bool sign = (opcode != Op_URShiftVB);
1344   int ext_vector_len = vector_len + 1;
1345   vextendbw(sign, dst, src, ext_vector_len);
1346   vpmovzxbw(vtmp, shift, ext_vector_len);
1347   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1348   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1349   if (vector_len == 0) {
1350     vextracti128_high(vtmp, dst);
1351     vpackuswb(dst, dst, vtmp, vector_len);
1352   } else {
1353     vextracti64x4_high(vtmp, dst);
1354     vpackuswb(dst, dst, vtmp, vector_len);
1355     vpermq(dst, dst, 0xD8, vector_len);
1356   }
1357 }
1358 
1359 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1360   switch(typ) {
1361     case T_BYTE:
1362       pinsrb(dst, val, idx);
1363       break;
1364     case T_SHORT:
1365       pinsrw(dst, val, idx);
1366       break;
1367     case T_INT:
1368       pinsrd(dst, val, idx);
1369       break;
1370     case T_LONG:
1371       pinsrq(dst, val, idx);
1372       break;
1373     default:
1374       assert(false,"Should not reach here.");
1375       break;
1376   }
1377 }
1378 
1379 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1380   switch(typ) {
1381     case T_BYTE:
1382       vpinsrb(dst, src, val, idx);
1383       break;
1384     case T_SHORT:
1385       vpinsrw(dst, src, val, idx);
1386       break;
1387     case T_INT:
1388       vpinsrd(dst, src, val, idx);
1389       break;
1390     case T_LONG:
1391       vpinsrq(dst, src, val, idx);
1392       break;
1393     default:
1394       assert(false,"Should not reach here.");
1395       break;
1396   }
1397 }
1398 
1399 void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
1400                                          Register base, Register idx_base,
1401                                          Register mask, Register mask_idx,
1402                                          Register rtmp, int vlen_enc) {
1403   vpxor(dst, dst, dst, vlen_enc);
1404   if (elem_bt == T_SHORT) {
1405     for (int i = 0; i < 4; i++) {
1406       // dst[i] = mask[i] ? src[idx_base[i]] : 0
1407       Label skip_load;
1408       btq(mask, mask_idx);
1409       jccb(Assembler::carryClear, skip_load);
1410       movl(rtmp, Address(idx_base, i * 4));
1411       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1412       bind(skip_load);
1413       incq(mask_idx);
1414     }
1415   } else {
1416     assert(elem_bt == T_BYTE, "");
1417     for (int i = 0; i < 8; i++) {
1418       // dst[i] = mask[i] ? src[idx_base[i]] : 0
1419       Label skip_load;
1420       btq(mask, mask_idx);
1421       jccb(Assembler::carryClear, skip_load);
1422       movl(rtmp, Address(idx_base, i * 4));
1423       pinsrb(dst, Address(base, rtmp), i);
1424       bind(skip_load);
1425       incq(mask_idx);
1426     }
1427   }
1428 }
1429 
1430 void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
1431                                   Register base, Register idx_base,
1432                                   Register rtmp, int vlen_enc) {
1433   vpxor(dst, dst, dst, vlen_enc);
1434   if (elem_bt == T_SHORT) {
1435     for (int i = 0; i < 4; i++) {
1436       // dst[i] = src[idx_base[i]]
1437       movl(rtmp, Address(idx_base, i * 4));
1438       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1439     }
1440   } else {
1441     assert(elem_bt == T_BYTE, "");
1442     for (int i = 0; i < 8; i++) {
1443       // dst[i] = src[idx_base[i]]
1444       movl(rtmp, Address(idx_base, i * 4));
1445       pinsrb(dst, Address(base, rtmp), i);
1446     }
1447   }
1448 }
1449 
1450 /*
1451  * Gather using a hybrid algorithm: first partially unroll a scalar loop
1452  * to accumulate values from the gather indices into a quad-word (64-bit) slice.
1453  * A slice may hold 8 byte or 4 short values. This is followed by a vector
1454  * permutation to place the slice into the appropriate vector lane
1455  * locations of the destination vector. The following pseudo code describes
1456  * the algorithm in detail:
1457  *
1458  * DST_VEC = ZERO_VEC
1459  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1460  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1461  * FOREACH_ITER:
1462  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1463  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1464  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1465  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1466  *
1467  * With each iteration, the doubleword permute indices (0, 1) corresponding
1468  * to the gathered quadword shift right by two lane positions.
1469  *
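 * A scalar sketch of the same algorithm (illustrative only; bytes shown, a short
 * gather handles 4 elements per 8-byte slice instead of 8, and dst_bytes denotes
 * the byte view of DST_VEC):
 *
 *   for (int done = 0; done < vector_len; done += 8) {
 *     uint64_t slice = 0;
 *     for (int i = 0; i < 8; i++) {              // vgather8b / vgather8b_masked
 *       int idx = idx_base[done + i];
 *       uint8_t v = (no mask || mask bit (done + i) is set) ? base[idx] : 0;
 *       slice |= (uint64_t)v << (8 * i);
 *     }
 *     memcpy(&dst_bytes[done], &slice, 8);       // vpermd + vpor place the slice
 *   }
 *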
1470  */
1471 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1472                                         Register base, Register idx_base,
1473                                         Register mask, XMMRegister xtmp1,
1474                                         XMMRegister xtmp2, XMMRegister temp_dst,
1475                                         Register rtmp, Register mask_idx,
1476                                         Register length, int vector_len, int vlen_enc) {
1477   Label GATHER8_LOOP;
1478   assert(is_subword_type(elem_ty), "");
1479   movl(length, vector_len);
1480   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1481   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1482   vallones(xtmp2, vlen_enc);
1483   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1484   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1485   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1486 
1487   bind(GATHER8_LOOP);
1488     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1489     if (mask == noreg) {
1490       vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
1491     } else {
1492       vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
1493     }
1494     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1495     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1496     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1497     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1498     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1499     vpor(dst, dst, temp_dst, vlen_enc);
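    // Advance to the next 8-byte slice: for 1-byte elements idx_base steps by
    // 32 >> 0 = 32 bytes (8 int indices) and length drops by 8; for 2-byte elements
    // it steps by 32 >> 1 = 16 bytes (4 indices) and length drops by 4.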
1500     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
1501     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1502     jcc(Assembler::notEqual, GATHER8_LOOP);
1503 }
1504 
1505 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1506   switch(typ) {
1507     case T_INT:
1508       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1509       break;
1510     case T_FLOAT:
1511       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1512       break;
1513     case T_LONG:
1514       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1515       break;
1516     case T_DOUBLE:
1517       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1518       break;
1519     default:
1520       assert(false,"Should not reach here.");
1521       break;
1522   }
1523 }
1524 
1525 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1526   switch(typ) {
1527     case T_INT:
1528       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1529       break;
1530     case T_FLOAT:
1531       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1532       break;
1533     case T_LONG:
1534       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1535       break;
1536     case T_DOUBLE:
1537       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1538       break;
1539     default:
1540       assert(false,"Should not reach here.");
1541       break;
1542   }
1543 }
1544 
1545 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1546   switch(typ) {
1547     case T_INT:
1548       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1549       break;
1550     case T_FLOAT:
1551       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1552       break;
1553     case T_LONG:
1554       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1555       break;
1556     case T_DOUBLE:
1557       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1558       break;
1559     default:
1560       assert(false,"Should not reach here.");
1561       break;
1562   }
1563 }
1564 
1565 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1566   if (vlen_in_bytes <= 16) {
1567     pxor (dst, dst);
1568     psubb(dst, src);
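    // Example (illustrative): a mask byte of 1 becomes 0 - 1 = 0xFF here, and the
    // sign extension below widens it to all-ones of the element width; a 0 byte stays 0.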
1569     switch (elem_bt) {
1570       case T_BYTE:   /* nothing to do */ break;
1571       case T_SHORT:  pmovsxbw(dst, dst); break;
1572       case T_INT:    pmovsxbd(dst, dst); break;
1573       case T_FLOAT:  pmovsxbd(dst, dst); break;
1574       case T_LONG:   pmovsxbq(dst, dst); break;
1575       case T_DOUBLE: pmovsxbq(dst, dst); break;
1576 
1577       default: assert(false, "%s", type2name(elem_bt));
1578     }
1579   } else {
1580     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1581     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1582 
1583     vpxor (dst, dst, dst, vlen_enc);
1584     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1585 
1586     switch (elem_bt) {
1587       case T_BYTE:   /* nothing to do */            break;
1588       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1589       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1590       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1591       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1592       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1593 
1594       default: assert(false, "%s", type2name(elem_bt));
1595     }
1596   }
1597 }
1598 
1599 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1600   if (novlbwdq) {
1601     vpmovsxbd(xtmp, src, vlen_enc);
1602     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1603             Assembler::eq, true, vlen_enc, noreg);
1604   } else {
1605     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1606     vpsubb(xtmp, xtmp, src, vlen_enc);
1607     evpmovb2m(dst, xtmp, vlen_enc);
1608   }
1609 }
1610 
1611 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1612   if (is_integral_type(bt)) {
1613     switch (vlen_in_bytes) {
1614       case 4:  movdl(dst, src);   break;
1615       case 8:  movq(dst, src);    break;
1616       case 16: movdqu(dst, src);  break;
1617       case 32: vmovdqu(dst, src); break;
1618       case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1619       default: ShouldNotReachHere();
1620     }
1621   } else {
1622     switch (vlen_in_bytes) {
1623       case 4:  movflt(dst, src); break;
1624       case 8:  movdbl(dst, src); break;
1625       case 16: movups(dst, src); break;
1626       case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1627       case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1628       default: ShouldNotReachHere();
1629     }
1630   }
1631 }
1632 
1633 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1634   assert(rscratch != noreg || always_reachable(src), "missing");
1635 
1636   if (reachable(src)) {
1637     load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1638   } else {
1639     lea(rscratch, src);
1640     load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1641   }
1642 }
1643 
1644 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1645   int vlen_enc = vector_length_encoding(vlen);
1646   if (VM_Version::supports_avx()) {
1647     if (bt == T_LONG) {
1648       if (VM_Version::supports_avx2()) {
1649         vpbroadcastq(dst, src, vlen_enc);
1650       } else {
1651         vmovddup(dst, src, vlen_enc);
1652       }
1653     } else if (bt == T_DOUBLE) {
1654       if (vlen_enc != Assembler::AVX_128bit) {
1655         vbroadcastsd(dst, src, vlen_enc, noreg);
1656       } else {
1657         vmovddup(dst, src, vlen_enc);
1658       }
1659     } else {
1660       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1661         vpbroadcastd(dst, src, vlen_enc);
1662       } else {
1663         vbroadcastss(dst, src, vlen_enc);
1664       }
1665     }
1666   } else if (VM_Version::supports_sse3()) {
1667     movddup(dst, src);
1668   } else {
1669     load_vector(bt, dst, src, vlen);
1670   }
1671 }
1672 
1673 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1674   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
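  // For example, the offsets computed below are: T_BYTE -> 0, T_SHORT -> 64,
  // T_INT -> 128, T_LONG -> 192, T_FLOAT -> 128 + 128 = 256, T_DOUBLE -> 192 + 128 = 320.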
1675   int offset = exact_log2(type2aelembytes(bt)) << 6;
1676   if (is_floating_point_type(bt)) {
1677     offset += 128;
1678   }
1679   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1680   load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1681 }
1682 
1683 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1684 
1685 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1686   int vector_len = Assembler::AVX_128bit;
1687 
1688   switch (opcode) {
1689     case Op_AndReductionV:  pand(dst, src); break;
1690     case Op_OrReductionV:   por (dst, src); break;
1691     case Op_XorReductionV:  pxor(dst, src); break;
1692     case Op_MinReductionV:
1693       switch (typ) {
1694         case T_BYTE:        pminsb(dst, src); break;
1695         case T_SHORT:       pminsw(dst, src); break;
1696         case T_INT:         pminsd(dst, src); break;
1697         case T_LONG:        assert(UseAVX > 2, "required");
1698                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1699         default:            assert(false, "wrong type");
1700       }
1701       break;
1702     case Op_MaxReductionV:
1703       switch (typ) {
1704         case T_BYTE:        pmaxsb(dst, src); break;
1705         case T_SHORT:       pmaxsw(dst, src); break;
1706         case T_INT:         pmaxsd(dst, src); break;
1707         case T_LONG:        assert(UseAVX > 2, "required");
1708                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1709         default:            assert(false, "wrong type");
1710       }
1711       break;
1712     case Op_AddReductionVF: addss(dst, src); break;
1713     case Op_AddReductionVD: addsd(dst, src); break;
1714     case Op_AddReductionVI:
1715       switch (typ) {
1716         case T_BYTE:        paddb(dst, src); break;
1717         case T_SHORT:       paddw(dst, src); break;
1718         case T_INT:         paddd(dst, src); break;
1719         default:            assert(false, "wrong type");
1720       }
1721       break;
1722     case Op_AddReductionVL: paddq(dst, src); break;
1723     case Op_MulReductionVF: mulss(dst, src); break;
1724     case Op_MulReductionVD: mulsd(dst, src); break;
1725     case Op_MulReductionVI:
1726       switch (typ) {
1727         case T_SHORT:       pmullw(dst, src); break;
1728         case T_INT:         pmulld(dst, src); break;
1729         default:            assert(false, "wrong type");
1730       }
1731       break;
1732     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1733                             evpmullq(dst, dst, src, vector_len); break;
1734     default:                assert(false, "wrong opcode");
1735   }
1736 }
1737 
1738 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1739   switch (opcode) {
1740     case Op_AddReductionVF: addps(dst, src); break;
1741     case Op_AddReductionVD: addpd(dst, src); break;
1742     case Op_MulReductionVF: mulps(dst, src); break;
1743     case Op_MulReductionVD: mulpd(dst, src); break;
1744     default:                assert(false, "%s", NodeClassNames[opcode]);
1745   }
1746 }
1747 
1748 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1749   int vector_len = Assembler::AVX_256bit;
1750 
1751   switch (opcode) {
1752     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1753     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1754     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1755     case Op_MinReductionV:
1756       switch (typ) {
1757         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1758         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1759         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1760         case T_LONG:        assert(UseAVX > 2, "required");
1761                             vpminsq(dst, src1, src2, vector_len); break;
1762         default:            assert(false, "wrong type");
1763       }
1764       break;
1765     case Op_MaxReductionV:
1766       switch (typ) {
1767         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1768         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1769         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1770         case T_LONG:        assert(UseAVX > 2, "required");
1771                             vpmaxsq(dst, src1, src2, vector_len); break;
1772         default:            assert(false, "wrong type");
1773       }
1774       break;
1775     case Op_AddReductionVI:
1776       switch (typ) {
1777         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1778         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1779         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1780         default:            assert(false, "wrong type");
1781       }
1782       break;
1783     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1784     case Op_MulReductionVI:
1785       switch (typ) {
1786         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1787         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1788         default:            assert(false, "wrong type");
1789       }
1790       break;
1791     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1792     default:                assert(false, "wrong opcode");
1793   }
1794 }
1795 
1796 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1797   int vector_len = Assembler::AVX_256bit;
1798 
1799   switch (opcode) {
1800     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1801     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1802     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1803     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1804     default:                assert(false, "%s", NodeClassNames[opcode]);
1805   }
1806 }
1807 
1808 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1809                                   XMMRegister dst, XMMRegister src,
1810                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1811   switch (opcode) {
1812     case Op_AddReductionVF:
1813     case Op_MulReductionVF:
1814       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1815       break;
1816 
1817     case Op_AddReductionVD:
1818     case Op_MulReductionVD:
1819       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1820       break;
1821 
1822     default: assert(false, "wrong opcode");
1823   }
1824 }
1825 
1826 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1827                                             XMMRegister dst, XMMRegister src,
1828                                             XMMRegister vtmp1, XMMRegister vtmp2) {
1829   switch (opcode) {
1830     case Op_AddReductionVF:
1831     case Op_MulReductionVF:
1832       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1833       break;
1834 
1835     case Op_AddReductionVD:
1836     case Op_MulReductionVD:
1837       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1838       break;
1839 
1840     default: assert(false, "%s", NodeClassNames[opcode]);
1841   }
1842 }
1843 
1844 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1845                              Register dst, Register src1, XMMRegister src2,
1846                              XMMRegister vtmp1, XMMRegister vtmp2) {
1847   switch (vlen) {
1848     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1849     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1850     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1851     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1852 
1853     default: assert(false, "wrong vector length");
1854   }
1855 }
1856 
1857 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1858                              Register dst, Register src1, XMMRegister src2,
1859                              XMMRegister vtmp1, XMMRegister vtmp2) {
1860   switch (vlen) {
1861     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1862     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1863     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1864     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1865 
1866     default: assert(false, "wrong vector length");
1867   }
1868 }
1869 
1870 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1871                              Register dst, Register src1, XMMRegister src2,
1872                              XMMRegister vtmp1, XMMRegister vtmp2) {
1873   switch (vlen) {
1874     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1875     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1876     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1877     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1878 
1879     default: assert(false, "wrong vector length");
1880   }
1881 }
1882 
1883 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1884                              Register dst, Register src1, XMMRegister src2,
1885                              XMMRegister vtmp1, XMMRegister vtmp2) {
1886   switch (vlen) {
1887     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1888     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1889     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1890     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1891 
1892     default: assert(false, "wrong vector length");
1893   }
1894 }
1895 
1896 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1897                              Register dst, Register src1, XMMRegister src2,
1898                              XMMRegister vtmp1, XMMRegister vtmp2) {
1899   switch (vlen) {
1900     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1901     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1902     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1903 
1904     default: assert(false, "wrong vector length");
1905   }
1906 }
1907 
1908 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1909   switch (vlen) {
1910     case 2:
1911       assert(vtmp2 == xnoreg, "");
1912       reduce2F(opcode, dst, src, vtmp1);
1913       break;
1914     case 4:
1915       assert(vtmp2 == xnoreg, "");
1916       reduce4F(opcode, dst, src, vtmp1);
1917       break;
1918     case 8:
1919       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1920       break;
1921     case 16:
1922       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1923       break;
1924     default: assert(false, "wrong vector length");
1925   }
1926 }
1927 
1928 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1929   switch (vlen) {
1930     case 2:
1931       assert(vtmp2 == xnoreg, "");
1932       reduce2D(opcode, dst, src, vtmp1);
1933       break;
1934     case 4:
1935       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1936       break;
1937     case 8:
1938       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1939       break;
1940     default: assert(false, "wrong vector length");
1941   }
1942 }
1943 
1944 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1945   switch (vlen) {
1946     case 2:
1947       assert(vtmp1 == xnoreg, "");
1948       assert(vtmp2 == xnoreg, "");
1949       unorderedReduce2F(opcode, dst, src);
1950       break;
1951     case 4:
1952       assert(vtmp2 == xnoreg, "");
1953       unorderedReduce4F(opcode, dst, src, vtmp1);
1954       break;
1955     case 8:
1956       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
1957       break;
1958     case 16:
1959       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
1960       break;
1961     default: assert(false, "wrong vector length");
1962   }
1963 }
1964 
1965 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1966   switch (vlen) {
1967     case 2:
1968       assert(vtmp1 == xnoreg, "");
1969       assert(vtmp2 == xnoreg, "");
1970       unorderedReduce2D(opcode, dst, src);
1971       break;
1972     case 4:
1973       assert(vtmp2 == xnoreg, "");
1974       unorderedReduce4D(opcode, dst, src, vtmp1);
1975       break;
1976     case 8:
1977       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
1978       break;
1979     default: assert(false, "wrong vector length");
1980   }
1981 }
1982 
1983 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1984   if (opcode == Op_AddReductionVI) {
1985     if (vtmp1 != src2) {
1986       movdqu(vtmp1, src2);
1987     }
1988     phaddd(vtmp1, vtmp1);
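    // e.g. with src2 = {a0, a1, ...}, phaddd leaves a0 + a1 in lane 0, which is
    // then combined with the scalar input from src1 below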
1989   } else {
1990     pshufd(vtmp1, src2, 0x1);
1991     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1992   }
1993   movdl(vtmp2, src1);
1994   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1995   movdl(dst, vtmp1);
1996 }
1997 
1998 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1999   if (opcode == Op_AddReductionVI) {
2000     if (vtmp1 != src2) {
2001       movdqu(vtmp1, src2);
2002     }
2003     phaddd(vtmp1, src2);
2004     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2005   } else {
2006     pshufd(vtmp2, src2, 0xE);
2007     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2008     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2009   }
2010 }
2011 
2012 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2013   if (opcode == Op_AddReductionVI) {
2014     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2015     vextracti128_high(vtmp2, vtmp1);
2016     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2017     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2018   } else {
2019     vextracti128_high(vtmp1, src2);
2020     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2021     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2022   }
2023 }
2024 
2025 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2026   vextracti64x4_high(vtmp2, src2);
2027   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2028   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2029 }
2030 
2031 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2032   pshufd(vtmp2, src2, 0x1);
2033   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2034   movdqu(vtmp1, vtmp2);
2035   psrldq(vtmp1, 2);
2036   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2037   movdqu(vtmp2, vtmp1);
2038   psrldq(vtmp2, 1);
2039   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2040   movdl(vtmp2, src1);
2041   pmovsxbd(vtmp1, vtmp1);
2042   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2043   pextrb(dst, vtmp1, 0x0);
2044   movsbl(dst, dst);
2045 }
2046 
2047 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2048   pshufd(vtmp1, src2, 0xE);
2049   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2050   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2051 }
2052 
2053 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2054   vextracti128_high(vtmp2, src2);
2055   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2056   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2057 }
2058 
2059 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2060   vextracti64x4_high(vtmp1, src2);
2061   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2062   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2063 }
2064 
2065 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2066   pmovsxbw(vtmp2, src2);
2067   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2068 }
2069 
2070 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2071   if (UseAVX > 1) {
2072     int vector_len = Assembler::AVX_256bit;
2073     vpmovsxbw(vtmp1, src2, vector_len);
2074     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2075   } else {
2076     pmovsxbw(vtmp2, src2);
2077     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2078     pshufd(vtmp2, src2, 0xE);   // bring the upper 8 bytes into the low quadword
2079     pmovsxbw(vtmp2, vtmp2);     // sign-extend them so the upper half is reduced as well
2080     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2081   }
2082 }
2083 
2084 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2085   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2086     int vector_len = Assembler::AVX_512bit;
2087     vpmovsxbw(vtmp1, src2, vector_len);
2088     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2089   } else {
2090     assert(UseAVX >= 2,"Should not reach here.");
2091     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2092     vextracti128_high(vtmp2, src2);
2093     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2094   }
2095 }
2096 
2097 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2098   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2099   vextracti64x4_high(vtmp2, src2);
2100   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2101 }
2102 
2103 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2104   if (opcode == Op_AddReductionVI) {
2105     if (vtmp1 != src2) {
2106       movdqu(vtmp1, src2);
2107     }
2108     phaddw(vtmp1, vtmp1);
2109     phaddw(vtmp1, vtmp1);
2110   } else {
2111     pshufd(vtmp2, src2, 0x1);
2112     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2113     movdqu(vtmp1, vtmp2);
2114     psrldq(vtmp1, 2);
2115     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2116   }
2117   movdl(vtmp2, src1);
2118   pmovsxwd(vtmp1, vtmp1);
2119   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2120   pextrw(dst, vtmp1, 0x0);
2121   movswl(dst, dst);
2122 }
2123 
2124 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2125   if (opcode == Op_AddReductionVI) {
2126     if (vtmp1 != src2) {
2127       movdqu(vtmp1, src2);
2128     }
2129     phaddw(vtmp1, src2);
2130   } else {
2131     pshufd(vtmp1, src2, 0xE);
2132     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2133   }
2134   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2135 }
2136 
2137 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2138   if (opcode == Op_AddReductionVI) {
2139     int vector_len = Assembler::AVX_256bit;
2140     vphaddw(vtmp2, src2, src2, vector_len);
2141     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2142   } else {
2143     vextracti128_high(vtmp2, src2);
2144     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2145   }
2146   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2147 }
2148 
2149 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2150   int vector_len = Assembler::AVX_256bit;
2151   vextracti64x4_high(vtmp1, src2);
2152   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2153   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2154 }
2155 
2156 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2157   pshufd(vtmp2, src2, 0xE);
2158   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2159   movdq(vtmp1, src1);
2160   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2161   movdq(dst, vtmp1);
2162 }
2163 
2164 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2165   vextracti128_high(vtmp1, src2);
2166   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2167   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2168 }
2169 
2170 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2171   vextracti64x4_high(vtmp2, src2);
2172   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2173   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2174 }
2175 
2176 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2177   mov64(temp, -1L);
2178   bzhiq(temp, temp, len);
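  // e.g. len = 5 leaves 0x1f in temp, so the resulting k-mask has its 5 lowest bits set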
2179   kmovql(dst, temp);
2180 }
2181 
2182 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2183   reduce_operation_128(T_FLOAT, opcode, dst, src);
2184   pshufd(vtmp, src, 0x1);
2185   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2186 }
2187 
2188 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2189   reduce2F(opcode, dst, src, vtmp);
2190   pshufd(vtmp, src, 0x2);
2191   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2192   pshufd(vtmp, src, 0x3);
2193   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2194 }
2195 
2196 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2197   reduce4F(opcode, dst, src, vtmp2);
2198   vextractf128_high(vtmp2, src);
2199   reduce4F(opcode, dst, vtmp2, vtmp1);
2200 }
2201 
2202 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2203   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2204   vextracti64x4_high(vtmp1, src);
2205   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2206 }
2207 
2208 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2209   pshufd(dst, src, 0x1);
2210   reduce_operation_128(T_FLOAT, opcode, dst, src);
2211 }
2212 
2213 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2214   pshufd(vtmp, src, 0xE);
2215   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2216   unorderedReduce2F(opcode, dst, vtmp);
2217 }
2218 
2219 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2220   vextractf128_high(vtmp1, src);
2221   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2222   unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2223 }
2224 
2225 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2226   vextractf64x4_high(vtmp2, src);
2227   unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2228   unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2229 }
2230 
2231 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2232   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2233   pshufd(vtmp, src, 0xE);
2234   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2235 }
2236 
2237 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2238   reduce2D(opcode, dst, src, vtmp2);
2239   vextractf128_high(vtmp2, src);
2240   reduce2D(opcode, dst, vtmp2, vtmp1);
2241 }
2242 
2243 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2244   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2245   vextracti64x4_high(vtmp1, src);
2246   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2247 }
2248 
2249 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2250   pshufd(dst, src, 0xE);
2251   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2252 }
2253 
2254 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2255   vextractf128_high(vtmp, src);
2256   unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2257   unorderedReduce2D(opcode, dst, vtmp);
2258 }
2259 
2260 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2261   vextractf64x4_high(vtmp2, src);
2262   unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2263   unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2264 }
2265 
2266 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2267   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2268 }
2269 
2270 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2271   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2272 }
2273 
2274 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
2275   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2276 }
2277 
2278 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2279                                  int vec_enc) {
2280   switch(elem_bt) {
2281     case T_INT:
2282     case T_FLOAT:
2283       vmaskmovps(dst, src, mask, vec_enc);
2284       break;
2285     case T_LONG:
2286     case T_DOUBLE:
2287       vmaskmovpd(dst, src, mask, vec_enc);
2288       break;
2289     default:
2290       fatal("Unsupported type %s", type2name(elem_bt));
2291       break;
2292   }
2293 }
2294 
2295 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2296                                  int vec_enc) {
2297   switch(elem_bt) {
2298     case T_INT:
2299     case T_FLOAT:
2300       vmaskmovps(dst, src, mask, vec_enc);
2301       break;
2302     case T_LONG:
2303     case T_DOUBLE:
2304       vmaskmovpd(dst, src, mask, vec_enc);
2305       break;
2306     default:
2307       fatal("Unsupported type %s", type2name(elem_bt));
2308       break;
2309   }
2310 }
2311 
2312 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2313                                           XMMRegister dst, XMMRegister src,
2314                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2315                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2316   const int permconst[] = {1, 14};
2317   XMMRegister wsrc = src;
2318   XMMRegister wdst = xmm_0;
2319   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2320 
2321   int vlen_enc = Assembler::AVX_128bit;
2322   if (vlen == 16) {
2323     vlen_enc = Assembler::AVX_256bit;
2324   }
2325 
2326   for (int i = log2(vlen) - 1; i >=0; i--) {
2327     if (i == 0 && !is_dst_valid) {
2328       wdst = dst;
2329     }
2330     if (i == 3) {
2331       vextracti64x4_high(wtmp, wsrc);
2332     } else if (i == 2) {
2333       vextracti128_high(wtmp, wsrc);
2334     } else { // i = [0,1]
2335       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2336     }
2337 
2338     if (VM_Version::supports_avx10_2()) {
2339       vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
2340     } else {
2341       vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2342     }
2343     wsrc = wdst;
2344     vlen_enc = Assembler::AVX_128bit;
2345   }
2346   if (is_dst_valid) {
2347     if (VM_Version::supports_avx10_2()) {
2348       vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
2349     } else {
2350       vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2351     }
2352   }
2353 }
2354 
2355 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2356                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2357                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2358   XMMRegister wsrc = src;
2359   XMMRegister wdst = xmm_0;
2360   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2361   int vlen_enc = Assembler::AVX_128bit;
2362   if (vlen == 8) {
2363     vlen_enc = Assembler::AVX_256bit;
2364   }
2365   for (int i = log2(vlen) - 1; i >=0; i--) {
2366     if (i == 0 && !is_dst_valid) {
2367       wdst = dst;
2368     }
2369     if (i == 1) {
2370       vextracti128_high(wtmp, wsrc);
2371     } else if (i == 2) {
2372       vextracti64x4_high(wtmp, wsrc);
2373     } else {
2374       assert(i == 0, "%d", i);
2375       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2376     }
2377 
2378     if (VM_Version::supports_avx10_2()) {
2379       vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
2380     } else {
2381       vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2382     }
2383 
2384     wsrc = wdst;
2385     vlen_enc = Assembler::AVX_128bit;
2386   }
2387 
2388   if (is_dst_valid) {
2389     if (VM_Version::supports_avx10_2()) {
2390       vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
2391     } else {
2392       vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2393     }
2394   }
2395 }
2396 
2397 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2398   switch (bt) {
2399     case T_BYTE:  pextrb(dst, src, idx); break;
2400     case T_SHORT: pextrw(dst, src, idx); break;
2401     case T_INT:   pextrd(dst, src, idx); break;
2402     case T_LONG:  pextrq(dst, src, idx); break;
2403 
2404     default:
2405       assert(false,"Should not reach here.");
2406       break;
2407   }
2408 }
2409 
2410 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2411   int esize =  type2aelembytes(typ);
2412   int elem_per_lane = 16/esize;
2413   int lane = elemindex / elem_per_lane;
2414   int eindex = elemindex % elem_per_lane;
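  // Example (illustrative): for T_INT, elemindex 9 in a 512-bit vector gives
  // lane = 2 and eindex = 1; lane 2 is extracted below and a subsequent
  // get_elem() call picks element 9 % 4 == 1 from the returned register.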
2415 
2416   if (lane >= 2) {
2417     assert(UseAVX > 2, "required");
2418     vextractf32x4(dst, src, lane & 3);
2419     return dst;
2420   } else if (lane > 0) {
2421     assert(UseAVX > 0, "required");
2422     vextractf128(dst, src, lane);
2423     return dst;
2424   } else {
2425     return src;
2426   }
2427 }
2428 
2429 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2430   if (typ == T_BYTE) {
2431     movsbl(dst, dst);
2432   } else if (typ == T_SHORT) {
2433     movswl(dst, dst);
2434   }
2435 }
2436 
2437 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2438   int esize =  type2aelembytes(typ);
2439   int elem_per_lane = 16/esize;
2440   int eindex = elemindex % elem_per_lane;
2441   assert(is_integral_type(typ),"required");
2442 
2443   if (eindex == 0) {
2444     if (typ == T_LONG) {
2445       movq(dst, src);
2446     } else {
2447       movdl(dst, src);
2448       movsxl(typ, dst);
2449     }
2450   } else {
2451     extract(typ, dst, src, eindex);
2452     movsxl(typ, dst);
2453   }
2454 }
2455 
2456 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2457   int esize =  type2aelembytes(typ);
2458   int elem_per_lane = 16/esize;
2459   int eindex = elemindex % elem_per_lane;
2460   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2461 
2462   if (eindex == 0) {
2463     movq(dst, src);
2464   } else {
2465     if (typ == T_FLOAT) {
2466       if (UseAVX == 0) {
2467         movdqu(dst, src);
2468         shufps(dst, dst, eindex);
2469       } else {
2470         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2471       }
2472     } else {
2473       if (UseAVX == 0) {
2474         movdqu(dst, src);
2475         psrldq(dst, eindex*esize);
2476       } else {
2477         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2478       }
2479       movq(dst, dst);
2480     }
2481   }
2482   // Zero upper bits
2483   if (typ == T_FLOAT) {
2484     if (UseAVX == 0) {
2485       assert(vtmp != xnoreg, "required.");
2486       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2487       pand(dst, vtmp);
2488     } else {
2489       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2490     }
2491   }
2492 }
2493 
2494 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2495   switch(typ) {
2496     case T_BYTE:
2497     case T_BOOLEAN:
2498       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2499       break;
2500     case T_SHORT:
2501     case T_CHAR:
2502       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2503       break;
2504     case T_INT:
2505     case T_FLOAT:
2506       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2507       break;
2508     case T_LONG:
2509     case T_DOUBLE:
2510       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2511       break;
2512     default:
2513       assert(false,"Should not reach here.");
2514       break;
2515   }
2516 }
2517 
2518 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2519   assert(rscratch != noreg || always_reachable(src2), "missing");
2520 
2521   switch(typ) {
2522     case T_BOOLEAN:
2523     case T_BYTE:
2524       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2525       break;
2526     case T_CHAR:
2527     case T_SHORT:
2528       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2529       break;
2530     case T_INT:
2531     case T_FLOAT:
2532       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2533       break;
2534     case T_LONG:
2535     case T_DOUBLE:
2536       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2537       break;
2538     default:
2539       assert(false,"Should not reach here.");
2540       break;
2541   }
2542 }
2543 
2544 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2545   switch(typ) {
2546     case T_BYTE:
2547       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2548       break;
2549     case T_SHORT:
2550       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2551       break;
2552     case T_INT:
2553     case T_FLOAT:
2554       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2555       break;
2556     case T_LONG:
2557     case T_DOUBLE:
2558       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2559       break;
2560     default:
2561       assert(false,"Should not reach here.");
2562       break;
2563   }
2564 }
2565 
2566 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2567   assert(vlen_in_bytes <= 32, "");
2568   int esize = type2aelembytes(bt);
2569   if (vlen_in_bytes == 32) {
2570     assert(vtmp == xnoreg, "required.");
2571     if (esize >= 4) {
2572       vtestps(src1, src2, AVX_256bit);
2573     } else {
2574       vptest(src1, src2, AVX_256bit);
2575     }
2576     return;
2577   }
2578   if (vlen_in_bytes < 16) {
2579     // Duplicate the lower part to fill the whole register;
2580     // there is no need to do so for src2.
2581     assert(vtmp != xnoreg, "required");
2582     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2583     pshufd(vtmp, src1, shuffle_imm);
2584   } else {
2585     assert(vtmp == xnoreg, "required");
2586     vtmp = src1;
2587   }
2588   if (esize >= 4 && VM_Version::supports_avx()) {
2589     vtestps(vtmp, src2, AVX_128bit);
2590   } else {
2591     ptest(vtmp, src2);
2592   }
2593 }
2594 
2595 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2596 #ifdef ASSERT
2597   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2598   bool is_bw_supported = VM_Version::supports_avx512bw();
2599   if (is_bw && !is_bw_supported) {
2600     assert(vlen_enc != Assembler::AVX_512bit, "required");
2601     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2602            "XMM register should be 0-15");
2603   }
2604 #endif // ASSERT
2605   switch (elem_bt) {
2606     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2607     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2608     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2609     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2610     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2611     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2612     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2613   }
2614 }
2615 
2616 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2617   assert(UseAVX >= 2, "required");
2618   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2619   bool is_vl = vlen_enc != Assembler::AVX_512bit;
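  // Use the EVEX GPR-source broadcasts only when the required AVX-512 subset is
  // available (BW for byte/short elements, VL for vectors narrower than 512 bits);
  // otherwise go through an XMM register and the VEX-encoded broadcasts below.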
2620   if ((UseAVX > 2) &&
2621       (!is_bw || VM_Version::supports_avx512bw()) &&
2622       (!is_vl || VM_Version::supports_avx512vl())) {
2623     switch (elem_bt) {
2624       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2625       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2626       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2627       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2628       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2629     }
2630   } else {
2631     assert(vlen_enc != Assembler::AVX_512bit, "required");
2632     assert((dst->encoding() < 16),"XMM register should be 0-15");
2633     switch (elem_bt) {
2634       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2635       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2636       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2637       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2638       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2639       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2640       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2641     }
2642   }
2643 }
2644 
2645 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2646   switch (to_elem_bt) {
2647     case T_SHORT:
2648       vpmovsxbw(dst, src, vlen_enc);
2649       break;
2650     case T_INT:
2651       vpmovsxbd(dst, src, vlen_enc);
2652       break;
2653     case T_FLOAT:
2654       vpmovsxbd(dst, src, vlen_enc);
2655       vcvtdq2ps(dst, dst, vlen_enc);
2656       break;
2657     case T_LONG:
2658       vpmovsxbq(dst, src, vlen_enc);
2659       break;
2660     case T_DOUBLE: {
2661       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
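      // Only half as many doubles as dwords fit in a vector of a given width, so
      // widen the bytes at half the target width before converting int -> double.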
2662       vpmovsxbd(dst, src, mid_vlen_enc);
2663       vcvtdq2pd(dst, dst, vlen_enc);
2664       break;
2665     }
2666     default:
2667       fatal("Unsupported type %s", type2name(to_elem_bt));
2668       break;
2669   }
2670 }
2671 
2672 //-------------------------------------------------------------------------------------------
2673 
2674 // IndexOf for constant substrings with size >= 8 chars
2675 // which don't need to be loaded through the stack.
2676 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2677                                          Register cnt1, Register cnt2,
2678                                          int int_cnt2,  Register result,
2679                                          XMMRegister vec, Register tmp,
2680                                          int ae) {
2681   ShortBranchVerifier sbv(this);
2682   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2683   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2684 
2685   // This method uses the pcmpestri instruction with bound registers
2686   //   inputs:
2687   //     xmm - substring
2688   //     rax - substring length (elements count)
2689   //     mem - scanned string
2690   //     rdx - string length (elements count)
2691   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2692   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2693   //   outputs:
2694   //     rcx - matched index in string
2695   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2696   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2697   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2698   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2699   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
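  // For example, with LL (byte/byte) encoding mode is 0x0c and each pcmpestri step
  // scans a 16-byte chunk of the string; with UU/UL it is 0x0d and each step scans
  // 8 chars, rcx receiving the index of the first candidate match in the chunk.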
2700 
2701   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2702         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2703         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2704 
2705   // Note, inline_string_indexOf() generates checks:
2706   // if (substr.count > string.count) return -1;
2707   // if (substr.count == 0) return 0;
2708   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2709 
2710   // Load substring.
2711   if (ae == StrIntrinsicNode::UL) {
2712     pmovzxbw(vec, Address(str2, 0));
2713   } else {
2714     movdqu(vec, Address(str2, 0));
2715   }
2716   movl(cnt2, int_cnt2);
2717   movptr(result, str1); // string addr
2718 
2719   if (int_cnt2 > stride) {
2720     jmpb(SCAN_TO_SUBSTR);
2721 
2722     // Reload substr for rescan; this code
2723     // is executed only for large substrings (> 8 chars)
2724     bind(RELOAD_SUBSTR);
2725     if (ae == StrIntrinsicNode::UL) {
2726       pmovzxbw(vec, Address(str2, 0));
2727     } else {
2728       movdqu(vec, Address(str2, 0));
2729     }
2730     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2731 
2732     bind(RELOAD_STR);
2733     // We came here after the beginning of the substring was
2734     // matched but the rest of it was not, so we need to search
2735     // again. Start from the next element after the previous match.
2736 
2737     // cnt2 is the number of remaining substring elements and
2738     // cnt1 is the number of remaining string elements when the compare failed.
2739     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2740     subl(cnt1, cnt2);
2741     addl(cnt1, int_cnt2);
2742     movl(cnt2, int_cnt2); // Now restore cnt2
2743 
2744     decrementl(cnt1);     // Shift to next element
2745     cmpl(cnt1, cnt2);
2746     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2747 
2748     addptr(result, (1<<scale1));
2749 
2750   } // (int_cnt2 > 8)
2751 
2752   // Scan string for start of substr in 16-byte vectors
2753   bind(SCAN_TO_SUBSTR);
2754   pcmpestri(vec, Address(result, 0), mode);
2755   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2756   subl(cnt1, stride);
2757   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2758   cmpl(cnt1, cnt2);
2759   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2760   addptr(result, 16);
2761   jmpb(SCAN_TO_SUBSTR);
2762 
2763   // Found a potential substr
2764   bind(FOUND_CANDIDATE);
2765   // Matched whole vector if first element matched (tmp(rcx) == 0).
2766   if (int_cnt2 == stride) {
2767     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2768   } else { // int_cnt2 > 8
2769     jccb(Assembler::overflow, FOUND_SUBSTR);
2770   }
2771   // After pcmpestri tmp(rcx) contains matched element index
2772   // Compute start addr of substr
2773   lea(result, Address(result, tmp, scale1));
2774 
2775   // Make sure string is still long enough
2776   subl(cnt1, tmp);
2777   cmpl(cnt1, cnt2);
2778   if (int_cnt2 == stride) {
2779     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2780   } else { // int_cnt2 > 8
2781     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2782   }
2783   // Fewer elements left than the substring.
2784 
2785   bind(RET_NOT_FOUND);
2786   movl(result, -1);
2787   jmp(EXIT);
2788 
2789   if (int_cnt2 > stride) {
2790     // This code is optimized for the case when whole substring
2791     // is matched if its head is matched.
2792     bind(MATCH_SUBSTR_HEAD);
2793     pcmpestri(vec, Address(result, 0), mode);
2794     // Reload only the string if it does not match
2795     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2796 
2797     Label CONT_SCAN_SUBSTR;
2798     // Compare the rest of substring (> 8 chars).
2799     bind(FOUND_SUBSTR);
2800     // First 8 chars are already matched.
2801     negptr(cnt2);
2802     addptr(cnt2, stride);
2803 
2804     bind(SCAN_SUBSTR);
2805     subl(cnt1, stride);
2806     cmpl(cnt2, -stride); // Do not read beyond substring
2807     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2808     // Back-up strings to avoid reading beyond substring:
2809     // cnt1 = cnt1 - cnt2 + 8
2810     addl(cnt1, cnt2); // cnt2 is negative
2811     addl(cnt1, stride);
2812     movl(cnt2, stride); negptr(cnt2);
2813     bind(CONT_SCAN_SUBSTR);
2814     if (int_cnt2 < (int)G) {
2815       int tail_off1 = int_cnt2<<scale1;
2816       int tail_off2 = int_cnt2<<scale2;
2817       if (ae == StrIntrinsicNode::UL) {
2818         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2819       } else {
2820         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2821       }
2822       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2823     } else {
2824       // calculate index in register to avoid integer overflow (int_cnt2*2)
2825       movl(tmp, int_cnt2);
2826       addptr(tmp, cnt2);
2827       if (ae == StrIntrinsicNode::UL) {
2828         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2829       } else {
2830         movdqu(vec, Address(str2, tmp, scale2, 0));
2831       }
2832       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2833     }
2834     // Need to reload string pointers if we did not match the whole vector
2835     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2836     addptr(cnt2, stride);
2837     jcc(Assembler::negative, SCAN_SUBSTR);
2838     // Fall through if found full substring
2839 
2840   } // (int_cnt2 > 8)
2841 
2842   bind(RET_FOUND);
2843   // Found result if we matched full small substring.
2844   // Compute substr offset
2845   subptr(result, str1);
2846   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2847     shrl(result, 1); // index
2848   }
2849   bind(EXIT);
2850 
2851 } // string_indexofC8
2852 
2853 // Small strings are loaded through the stack if they cross a page boundary.
2854 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2855                                        Register cnt1, Register cnt2,
2856                                        int int_cnt2,  Register result,
2857                                        XMMRegister vec, Register tmp,
2858                                        int ae) {
2859   ShortBranchVerifier sbv(this);
2860   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2861   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2862 
2863   //
2864   // int_cnt2 is the length of a small (< 8 chars) constant substring
2865   // or (-1) for a non-constant substring, in which case its length
2866   // is in the cnt2 register.
2867   //
2868   // Note, inline_string_indexOf() generates checks:
2869   // if (substr.count > string.count) return -1;
2870   // if (substr.count == 0) return 0;
2871   //
2872   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2873   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2874   // This method uses the pcmpestri instruction with bound registers
2875   //   inputs:
2876   //     xmm - substring
2877   //     rax - substring length (elements count)
2878   //     mem - scanned string
2879   //     rdx - string length (elements count)
2880   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2881   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2882   //   outputs:
2883   //     rcx - matched index in string
2884   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2885   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2886   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2887   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2888 
2889   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2890         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2891         FOUND_CANDIDATE;
2892 
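       // Informal sketch of the result computed below (Java-level equivalent):
       //   for (int i = 0; i <= cnt1 - cnt2; i++) {
       //     if (str1[i .. i+cnt2-1] equals str2[0 .. cnt2-1]) return i;
       //   }
       //   return -1;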
2893   { //========================================================
2894     // We don't know where these strings are located
2895     // and we can't read beyond them. Load them through the stack.
2896     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2897 
2898     movptr(tmp, rsp); // save old SP
2899 
2900     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2901       if (int_cnt2 == (1>>scale2)) { // One byte
2902         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2903         load_unsigned_byte(result, Address(str2, 0));
2904         movdl(vec, result); // move 32 bits
2905       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2906         // Not enough header space in 32-bit VM: 12+3 = 15.
2907         movl(result, Address(str2, -1));
2908         shrl(result, 8);
2909         movdl(vec, result); // move 32 bits
2910       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2911         load_unsigned_short(result, Address(str2, 0));
2912         movdl(vec, result); // move 32 bits
2913       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2914         movdl(vec, Address(str2, 0)); // move 32 bits
2915       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2916         movq(vec, Address(str2, 0));  // move 64 bits
2917       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
2918         // Array header size is 12 bytes in 32-bit VM
2919         // + 6 bytes for 3 chars == 18 bytes,
2920         // enough space to load vec and shift.
2921         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2922         if (ae == StrIntrinsicNode::UL) {
2923           int tail_off = int_cnt2-8;
2924           pmovzxbw(vec, Address(str2, tail_off));
2925           psrldq(vec, -2*tail_off);
2926         }
2927         else {
2928           int tail_off = int_cnt2*(1<<scale2);
2929           movdqu(vec, Address(str2, tail_off-16));
2930           psrldq(vec, 16-tail_off);
2931         }
2932       }
2933     } else { // not constant substring
2934       cmpl(cnt2, stride);
2935       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2936 
2937       // We can read beyond the string if str2+16 does not cross a page boundary
2938       // since heaps are aligned and mapped by pages.
2939       assert(os::vm_page_size() < (int)G, "default page should be small");
2940       movl(result, str2); // We need only low 32 bits
2941       andl(result, ((int)os::vm_page_size()-1));
2942       cmpl(result, ((int)os::vm_page_size()-16));
2943       jccb(Assembler::belowEqual, CHECK_STR);
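           // Illustrative example (assuming a 4K page): a 16-byte load from an
           // address whose in-page offset is above page_size-16, e.g. 0xff8 > 0xff0,
           // would cross into the next page, so such a short substring is copied
           // to the stack below instead.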
2944 
2945       // Move small strings to the stack to allow loading 16 bytes into vec.
2946       subptr(rsp, 16);
2947       int stk_offset = wordSize-(1<<scale2);
2948       push(cnt2);
2949 
2950       bind(COPY_SUBSTR);
2951       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2952         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2953         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2954       } else if (ae == StrIntrinsicNode::UU) {
2955         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2956         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2957       }
2958       decrement(cnt2);
2959       jccb(Assembler::notZero, COPY_SUBSTR);
2960 
2961       pop(cnt2);
2962       movptr(str2, rsp);  // New substring address
2963     } // non constant
2964 
2965     bind(CHECK_STR);
2966     cmpl(cnt1, stride);
2967     jccb(Assembler::aboveEqual, BIG_STRINGS);
2968 
2969     // Check whether str1+16 crosses a page boundary.
2970     movl(result, str1); // We need only low 32 bits
2971     andl(result, ((int)os::vm_page_size()-1));
2972     cmpl(result, ((int)os::vm_page_size()-16));
2973     jccb(Assembler::belowEqual, BIG_STRINGS);
2974 
2975     subptr(rsp, 16);
2976     int stk_offset = -(1<<scale1);
2977     if (int_cnt2 < 0) { // not constant
2978       push(cnt2);
2979       stk_offset += wordSize;
2980     }
2981     movl(cnt2, cnt1);
2982 
2983     bind(COPY_STR);
2984     if (ae == StrIntrinsicNode::LL) {
2985       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2986       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2987     } else {
2988       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2989       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2990     }
2991     decrement(cnt2);
2992     jccb(Assembler::notZero, COPY_STR);
2993 
2994     if (int_cnt2 < 0) { // not constant
2995       pop(cnt2);
2996     }
2997     movptr(str1, rsp);  // New string address
2998 
2999     bind(BIG_STRINGS);
3000     // Load substring.
3001     if (int_cnt2 < 0) { // -1
3002       if (ae == StrIntrinsicNode::UL) {
3003         pmovzxbw(vec, Address(str2, 0));
3004       } else {
3005         movdqu(vec, Address(str2, 0));
3006       }
3007       push(cnt2);       // substr count
3008       push(str2);       // substr addr
3009       push(str1);       // string addr
3010     } else {
3011       // Small (< 8 chars) constant substrings are loaded already.
3012       movl(cnt2, int_cnt2);
3013     }
3014     push(tmp);  // original SP
3015 
3016   } // Finished loading
3017 
3018   //========================================================
3019   // Start search
3020   //
3021 
3022   movptr(result, str1); // string addr
3023 
3024   if (int_cnt2  < 0) {  // Only for non constant substring
3025     jmpb(SCAN_TO_SUBSTR);
3026 
3027     // SP saved at sp+0
3028     // String saved at sp+1*wordSize
3029     // Substr saved at sp+2*wordSize
3030     // Substr count saved at sp+3*wordSize
3031 
3032     // Reload substr for rescan; this code
3033     // is executed only for large substrings (> 8 chars)
3034     bind(RELOAD_SUBSTR);
3035     movptr(str2, Address(rsp, 2*wordSize));
3036     movl(cnt2, Address(rsp, 3*wordSize));
3037     if (ae == StrIntrinsicNode::UL) {
3038       pmovzxbw(vec, Address(str2, 0));
3039     } else {
3040       movdqu(vec, Address(str2, 0));
3041     }
3042     // We came here after the beginning of the substring was
3043     // matched but the rest of it was not, so we need to search
3044     // again. Start from the next element after the previous match.
3045     subptr(str1, result); // Restore counter
3046     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3047       shrl(str1, 1);
3048     }
3049     addl(cnt1, str1);
3050     decrementl(cnt1);   // Shift to next element
3051     cmpl(cnt1, cnt2);
3052     jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
3053 
3054     addptr(result, (1<<scale1));
3055   } // non constant
3056 
3057   // Scan string for start of substr in 16-byte vectors
3058   bind(SCAN_TO_SUBSTR);
3059   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3060   pcmpestri(vec, Address(result, 0), mode);
3061   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3062   subl(cnt1, stride);
3063   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3064   cmpl(cnt1, cnt2);
3065   jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
3066   addptr(result, 16);
3067 
3068   bind(ADJUST_STR);
3069   cmpl(cnt1, stride); // Do not read beyond string
3070   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3071   // Back-up string to avoid reading beyond string.
3072   lea(result, Address(result, cnt1, scale1, -16));
3073   movl(cnt1, stride);
3074   jmpb(SCAN_TO_SUBSTR);
3075 
3076   // Found a potential substr
3077   bind(FOUND_CANDIDATE);
3078   // After pcmpestri tmp(rcx) contains matched element index
3079 
3080   // Make sure string is still long enough
3081   subl(cnt1, tmp);
3082   cmpl(cnt1, cnt2);
3083   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3084   // Fewer elements left than the substring.
3085 
3086   bind(RET_NOT_FOUND);
3087   movl(result, -1);
3088   jmp(CLEANUP);
3089 
3090   bind(FOUND_SUBSTR);
3091   // Compute start addr of substr
3092   lea(result, Address(result, tmp, scale1));
3093   if (int_cnt2 > 0) { // Constant substring
3094     // Repeat search for small substring (< 8 chars)
3095     // from new point without reloading substring.
3096     // Have to check that we don't read beyond string.
3097     cmpl(tmp, stride-int_cnt2);
3098     jccb(Assembler::greater, ADJUST_STR);
3099     // Fall through if matched whole substring.
3100   } else { // non constant
3101     assert(int_cnt2 == -1, "should be != 0");
3102 
3103     addl(tmp, cnt2);
3104     // Found result if we matched whole substring.
3105     cmpl(tmp, stride);
3106     jcc(Assembler::lessEqual, RET_FOUND);
3107 
3108     // Repeat search for small substring (<= 8 chars)
3109     // from new point 'str1' without reloading substring.
3110     cmpl(cnt2, stride);
3111     // Have to check that we don't read beyond string.
3112     jccb(Assembler::lessEqual, ADJUST_STR);
3113 
3114     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3115     // Compare the rest of substring (> 8 chars).
3116     movptr(str1, result);
3117 
3118     cmpl(tmp, cnt2);
3119     // First 8 chars are already matched.
3120     jccb(Assembler::equal, CHECK_NEXT);
3121 
3122     bind(SCAN_SUBSTR);
3123     pcmpestri(vec, Address(str1, 0), mode);
3124     // Need to reload string pointers if we did not match the whole vector
3125     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3126 
3127     bind(CHECK_NEXT);
3128     subl(cnt2, stride);
3129     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3130     addptr(str1, 16);
3131     if (ae == StrIntrinsicNode::UL) {
3132       addptr(str2, 8);
3133     } else {
3134       addptr(str2, 16);
3135     }
3136     subl(cnt1, stride);
3137     cmpl(cnt2, stride); // Do not read beyond substring
3138     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3139     // Back-up strings to avoid reading beyond substring.
3140 
3141     if (ae == StrIntrinsicNode::UL) {
3142       lea(str2, Address(str2, cnt2, scale2, -8));
3143       lea(str1, Address(str1, cnt2, scale1, -16));
3144     } else {
3145       lea(str2, Address(str2, cnt2, scale2, -16));
3146       lea(str1, Address(str1, cnt2, scale1, -16));
3147     }
3148     subl(cnt1, cnt2);
3149     movl(cnt2, stride);
3150     addl(cnt1, stride);
3151     bind(CONT_SCAN_SUBSTR);
3152     if (ae == StrIntrinsicNode::UL) {
3153       pmovzxbw(vec, Address(str2, 0));
3154     } else {
3155       movdqu(vec, Address(str2, 0));
3156     }
3157     jmp(SCAN_SUBSTR);
3158 
3159     bind(RET_FOUND_LONG);
3160     movptr(str1, Address(rsp, wordSize));
3161   } // non constant
3162 
3163   bind(RET_FOUND);
3164   // Compute substr offset
3165   subptr(result, str1);
3166   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3167     shrl(result, 1); // index
3168   }
3169   bind(CLEANUP);
3170   pop(rsp); // restore SP
3171 
3172 } // string_indexof
3173 
3174 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3175                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3176   ShortBranchVerifier sbv(this);
3177   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3178 
3179   int stride = 8;
3180 
3181   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3182         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3183         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3184         FOUND_SEQ_CHAR, DONE_LABEL;
3185 
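       // Informal sketch of what is computed: the index of the first char in
       // str1[0 .. cnt1) equal to 'ch', or -1. The code scans 16 chars per AVX2
       // iteration, then 8 chars per SSE iteration, then one char at a time.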
3186   movptr(result, str1);
3187   if (UseAVX >= 2) {
3188     cmpl(cnt1, stride);
3189     jcc(Assembler::less, SCAN_TO_CHAR);
3190     cmpl(cnt1, 2*stride);
3191     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3192     movdl(vec1, ch);
3193     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3194     vpxor(vec2, vec2);
3195     movl(tmp, cnt1);
3196     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3197     andl(cnt1,0x0000000F);  //tail count (in chars)
3198 
3199     bind(SCAN_TO_16_CHAR_LOOP);
3200     vmovdqu(vec3, Address(result, 0));
3201     vpcmpeqw(vec3, vec3, vec1, 1);
3202     vptest(vec2, vec3);
3203     jcc(Assembler::carryClear, FOUND_CHAR);
3204     addptr(result, 32);
3205     subl(tmp, 2*stride);
3206     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3207     jmp(SCAN_TO_8_CHAR);
3208     bind(SCAN_TO_8_CHAR_INIT);
3209     movdl(vec1, ch);
3210     pshuflw(vec1, vec1, 0x00);
3211     pshufd(vec1, vec1, 0);
3212     pxor(vec2, vec2);
3213   }
3214   bind(SCAN_TO_8_CHAR);
3215   cmpl(cnt1, stride);
3216   jcc(Assembler::less, SCAN_TO_CHAR);
3217   if (UseAVX < 2) {
3218     movdl(vec1, ch);
3219     pshuflw(vec1, vec1, 0x00);
3220     pshufd(vec1, vec1, 0);
3221     pxor(vec2, vec2);
3222   }
3223   movl(tmp, cnt1);
3224   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3225   andl(cnt1,0x00000007);  //tail count (in chars)
3226 
3227   bind(SCAN_TO_8_CHAR_LOOP);
3228   movdqu(vec3, Address(result, 0));
3229   pcmpeqw(vec3, vec1);
3230   ptest(vec2, vec3);
3231   jcc(Assembler::carryClear, FOUND_CHAR);
3232   addptr(result, 16);
3233   subl(tmp, stride);
3234   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3235   bind(SCAN_TO_CHAR);
3236   testl(cnt1, cnt1);
3237   jcc(Assembler::zero, RET_NOT_FOUND);
3238   bind(SCAN_TO_CHAR_LOOP);
3239   load_unsigned_short(tmp, Address(result, 0));
3240   cmpl(ch, tmp);
3241   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3242   addptr(result, 2);
3243   subl(cnt1, 1);
3244   jccb(Assembler::zero, RET_NOT_FOUND);
3245   jmp(SCAN_TO_CHAR_LOOP);
3246 
3247   bind(RET_NOT_FOUND);
3248   movl(result, -1);
3249   jmpb(DONE_LABEL);
3250 
3251   bind(FOUND_CHAR);
3252   if (UseAVX >= 2) {
3253     vpmovmskb(tmp, vec3);
3254   } else {
3255     pmovmskb(tmp, vec3);
3256   }
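       // tmp now holds a per-byte match mask; the lowest set bit gives the byte
       // offset of the first matching char within the current vector block.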
3257   bsfl(ch, tmp);
3258   addptr(result, ch);
3259 
3260   bind(FOUND_SEQ_CHAR);
3261   subptr(result, str1);
3262   shrl(result, 1);
3263 
3264   bind(DONE_LABEL);
3265 } // string_indexof_char
3266 
3267 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3268                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3269   ShortBranchVerifier sbv(this);
3270   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3271 
3272   int stride = 16;
3273 
3274   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3275         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3276         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3277         FOUND_SEQ_CHAR, DONE_LABEL;
3278 
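       // Latin-1 variant (informal sketch): the index of the first byte in
       // str1[0 .. cnt1) equal to 'ch', or -1. 32 bytes are scanned per AVX2
       // iteration, 16 bytes per SSE iteration, then a byte-at-a-time tail.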
3279   movptr(result, str1);
3280   if (UseAVX >= 2) {
3281     cmpl(cnt1, stride);
3282     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3283     cmpl(cnt1, stride*2);
3284     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3285     movdl(vec1, ch);
3286     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3287     vpxor(vec2, vec2);
3288     movl(tmp, cnt1);
3289     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3290     andl(cnt1,0x0000001F);  //tail count (in chars)
3291 
3292     bind(SCAN_TO_32_CHAR_LOOP);
3293     vmovdqu(vec3, Address(result, 0));
3294     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3295     vptest(vec2, vec3);
3296     jcc(Assembler::carryClear, FOUND_CHAR);
3297     addptr(result, 32);
3298     subl(tmp, stride*2);
3299     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3300     jmp(SCAN_TO_16_CHAR);
3301 
3302     bind(SCAN_TO_16_CHAR_INIT);
3303     movdl(vec1, ch);
3304     pxor(vec2, vec2);
3305     pshufb(vec1, vec2);
3306   }
3307 
3308   bind(SCAN_TO_16_CHAR);
3309   cmpl(cnt1, stride);
3310   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3311   if (UseAVX < 2) {
3312     movdl(vec1, ch);
3313     pxor(vec2, vec2);
3314     pshufb(vec1, vec2);
3315   }
3316   movl(tmp, cnt1);
3317   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3318   andl(cnt1,0x0000000F);  //tail count (in bytes)
3319 
3320   bind(SCAN_TO_16_CHAR_LOOP);
3321   movdqu(vec3, Address(result, 0));
3322   pcmpeqb(vec3, vec1);
3323   ptest(vec2, vec3);
3324   jcc(Assembler::carryClear, FOUND_CHAR);
3325   addptr(result, 16);
3326   subl(tmp, stride);
3327   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3328 
3329   bind(SCAN_TO_CHAR_INIT);
3330   testl(cnt1, cnt1);
3331   jcc(Assembler::zero, RET_NOT_FOUND);
3332   bind(SCAN_TO_CHAR_LOOP);
3333   load_unsigned_byte(tmp, Address(result, 0));
3334   cmpl(ch, tmp);
3335   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3336   addptr(result, 1);
3337   subl(cnt1, 1);
3338   jccb(Assembler::zero, RET_NOT_FOUND);
3339   jmp(SCAN_TO_CHAR_LOOP);
3340 
3341   bind(RET_NOT_FOUND);
3342   movl(result, -1);
3343   jmpb(DONE_LABEL);
3344 
3345   bind(FOUND_CHAR);
3346   if (UseAVX >= 2) {
3347     vpmovmskb(tmp, vec3);
3348   } else {
3349     pmovmskb(tmp, vec3);
3350   }
3351   bsfl(ch, tmp);
3352   addptr(result, ch);
3353 
3354   bind(FOUND_SEQ_CHAR);
3355   subptr(result, str1);
3356 
3357   bind(DONE_LABEL);
3358 } // stringL_indexof_char
3359 
3360 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3361   switch (eltype) {
3362   case T_BOOLEAN: return sizeof(jboolean);
3363   case T_BYTE:  return sizeof(jbyte);
3364   case T_SHORT: return sizeof(jshort);
3365   case T_CHAR:  return sizeof(jchar);
3366   case T_INT:   return sizeof(jint);
3367   default:
3368     ShouldNotReachHere();
3369     return -1;
3370   }
3371 }
3372 
3373 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3374   switch (eltype) {
3375   // T_BOOLEAN used as surrogate for unsigned byte
3376   case T_BOOLEAN: movzbl(dst, src);   break;
3377   case T_BYTE:    movsbl(dst, src);   break;
3378   case T_SHORT:   movswl(dst, src);   break;
3379   case T_CHAR:    movzwl(dst, src);   break;
3380   case T_INT:     movl(dst, src);     break;
3381   default:
3382     ShouldNotReachHere();
3383   }
3384 }
3385 
3386 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3387   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3388 }
3389 
3390 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3391   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3392 }
3393 
3394 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3395   const int vlen = Assembler::AVX_256bit;
3396   switch (eltype) {
3397   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3398   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3399   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3400   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3401   case T_INT:
3402     // do nothing
3403     break;
3404   default:
3405     ShouldNotReachHere();
3406   }
3407 }
3408 
3409 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3410                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3411                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3412                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3413                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3414                                         BasicType eltype) {
3415   ShortBranchVerifier sbv(this);
3416   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3417   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3418   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3419 
3420   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3421         SHORT_UNROLLED_LOOP_EXIT,
3422         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3423         UNROLLED_VECTOR_LOOP_BEGIN,
3424         END;
3425   switch (eltype) {
3426   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3427   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3428   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3429   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3430   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3431   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3432   }
3433 
3434   // Register aliases ("renaming") for readability of the code
3435   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3436                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3437                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3438 
3439   const int elsize = arrays_hashcode_elsize(eltype);
3440 
3441   /*
3442     if (cnt1 >= 2) {
3443       if (cnt1 >= 32) {
3444         UNROLLED VECTOR LOOP
3445       }
3446       UNROLLED SCALAR LOOP
3447     }
3448     SINGLE SCALAR
3449    */
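       // The vector loop evaluates the usual polynomial hash
       //   h = e[0]*31^(n-1) + e[1]*31^(n-2) + ... + e[n-1]
       // 32 elements at a time: four 8-lane accumulators are each multiplied by
       // power_of_31_backwards[0] per iteration and are finally scaled by the
       // remaining per-lane powers of 31 (table offsets 1..32) before being
       // reduced and added into result.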
3450 
3451   cmpl(cnt1, 32);
3452   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3453 
3454   // cnt1 >= 32 && generate_vectorized_loop
3455   xorl(index, index);
3456 
3457   // vresult = IntVector.zero(I256);
3458   for (int idx = 0; idx < 4; idx++) {
3459     vpxor(vresult[idx], vresult[idx]);
3460   }
3461   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3462   Register bound = tmp2;
3463   Register next = tmp3;
3464   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3465   movl(next, Address(tmp2, 0));
3466   movdl(vnext, next);
3467   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3468 
3469   // index = 0;
3470   // bound = cnt1 & ~(32 - 1);
3471   movl(bound, cnt1);
3472   andl(bound, ~(32 - 1));
3473   // for (; index < bound; index += 32) {
3474   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3475   // result *= next;
3476   imull(result, next);
3477   // loop fission to front-load the cost of fetching from memory; OOO execution
3478   // can then hopefully do a better job of prefetching
3479   for (int idx = 0; idx < 4; idx++) {
3480     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3481   }
3482   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3483   for (int idx = 0; idx < 4; idx++) {
3484     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3485     arrays_hashcode_elvcast(vtmp[idx], eltype);
3486     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3487   }
3488   // index += 32;
3489   addl(index, 32);
3490   // index < bound;
3491   cmpl(index, bound);
3492   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3493   // }
3494 
3495   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3496   subl(cnt1, bound);
3497   // release bound
3498 
3499   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3500   for (int idx = 0; idx < 4; idx++) {
3501     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3502     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3503     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3504   }
3505   // result += vresult.reduceLanes(ADD);
3506   for (int idx = 0; idx < 4; idx++) {
3507     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3508   }
3509 
3510   // } else if (cnt1 < 32) {
3511 
3512   bind(SHORT_UNROLLED_BEGIN);
3513   // int i = 1;
3514   movl(index, 1);
3515   cmpl(index, cnt1);
3516   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3517 
3518   // for (; i < cnt1 ; i += 2) {
3519   bind(SHORT_UNROLLED_LOOP_BEGIN);
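       // Each iteration folds two elements:
       //   result = result*31*31 + e[i-1]*31 + e[i]
       // where 961 == 31*31 and e*31 is computed as (e << 5) - e.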
3520   movl(tmp3, 961);
3521   imull(result, tmp3);
3522   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3523   movl(tmp3, tmp2);
3524   shll(tmp3, 5);
3525   subl(tmp3, tmp2);
3526   addl(result, tmp3);
3527   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3528   addl(result, tmp3);
3529   addl(index, 2);
3530   cmpl(index, cnt1);
3531   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3532 
3533   // }
3534   // if (i >= cnt1) {
3535   bind(SHORT_UNROLLED_LOOP_EXIT);
3536   jccb(Assembler::greater, END);
3537   movl(tmp2, result);
3538   shll(result, 5);
3539   subl(result, tmp2);
3540   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3541   addl(result, tmp3);
3542   // }
3543   bind(END);
3544 
3545   BLOCK_COMMENT("} // arrays_hashcode");
3546 
3547 } // arrays_hashcode
3548 
3549 // helper function for string_compare
3550 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3551                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3552                                            Address::ScaleFactor scale2, Register index, int ae) {
3553   if (ae == StrIntrinsicNode::LL) {
3554     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3555     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3556   } else if (ae == StrIntrinsicNode::UU) {
3557     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3558     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3559   } else {
3560     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3561     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3562   }
3563 }
3564 
3565 // Compare strings, used for char[] and byte[].
3566 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3567                                        Register cnt1, Register cnt2, Register result,
3568                                        XMMRegister vec1, int ae, KRegister mask) {
3569   ShortBranchVerifier sbv(this);
3570   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3571   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only with AVX3
3572   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3573   int stride2x2 = 0x40;
3574   Address::ScaleFactor scale = Address::no_scale;
3575   Address::ScaleFactor scale1 = Address::no_scale;
3576   Address::ScaleFactor scale2 = Address::no_scale;
3577 
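       // Informal sketch of the computed value (compareTo-style): the difference
       // of the first pair of mismatching elements, or the length difference if
       // the shorter string is a prefix of the longer one.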
3578   if (ae != StrIntrinsicNode::LL) {
3579     stride2x2 = 0x20;
3580   }
3581 
3582   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3583     shrl(cnt2, 1);
3584   }
3585   // Compute the minimum of the string lengths and the
3586   // difference of the string lengths (stack).
3587   // Do the conditional move stuff
3588   movl(result, cnt1);
3589   subl(cnt1, cnt2);
3590   push(cnt1);
3591   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3592 
3593   // Is the minimum length zero?
3594   testl(cnt2, cnt2);
3595   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3596   if (ae == StrIntrinsicNode::LL) {
3597     // Load first bytes
3598     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3599     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3600   } else if (ae == StrIntrinsicNode::UU) {
3601     // Load first characters
3602     load_unsigned_short(result, Address(str1, 0));
3603     load_unsigned_short(cnt1, Address(str2, 0));
3604   } else {
3605     load_unsigned_byte(result, Address(str1, 0));
3606     load_unsigned_short(cnt1, Address(str2, 0));
3607   }
3608   subl(result, cnt1);
3609   jcc(Assembler::notZero,  POP_LABEL);
3610 
3611   if (ae == StrIntrinsicNode::UU) {
3612     // Divide length by 2 to get number of chars
3613     shrl(cnt2, 1);
3614   }
3615   cmpl(cnt2, 1);
3616   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3617 
3618   // Check if the strings start at the same location and setup scale and stride
3619   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3620     cmpptr(str1, str2);
3621     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3622     if (ae == StrIntrinsicNode::LL) {
3623       scale = Address::times_1;
3624       stride = 16;
3625     } else {
3626       scale = Address::times_2;
3627       stride = 8;
3628     }
3629   } else {
3630     scale1 = Address::times_1;
3631     scale2 = Address::times_2;
3632     // scale not used
3633     stride = 8;
3634   }
3635 
3636   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3637     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3638     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3639     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3640     Label COMPARE_TAIL_LONG;
3641     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only with AVX3
3642 
3643     int pcmpmask = 0x19;
3644     if (ae == StrIntrinsicNode::LL) {
3645       pcmpmask &= ~0x01;
3646     }
3647 
3648     // Setup to compare 16-char (32-byte) vectors,
3649     // start from first character again because it has aligned address.
3650     if (ae == StrIntrinsicNode::LL) {
3651       stride2 = 32;
3652     } else {
3653       stride2 = 16;
3654     }
3655     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3656       adr_stride = stride << scale;
3657     } else {
3658       adr_stride1 = 8;  //stride << scale1;
3659       adr_stride2 = 16; //stride << scale2;
3660     }
3661 
3662     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3663     // rax and rdx are used by pcmpestri as element counters
3664     movl(result, cnt2);
3665     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3666     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3667 
3668     // Fast path: compare the first two 8-char vectors.
3669     bind(COMPARE_16_CHARS);
3670     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3671       movdqu(vec1, Address(str1, 0));
3672     } else {
3673       pmovzxbw(vec1, Address(str1, 0));
3674     }
3675     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3676     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3677 
3678     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3679       movdqu(vec1, Address(str1, adr_stride));
3680       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3681     } else {
3682       pmovzxbw(vec1, Address(str1, adr_stride1));
3683       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3684     }
3685     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3686     addl(cnt1, stride);
3687 
3688     // Compare the characters at index in cnt1
3689     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3690     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3691     subl(result, cnt2);
3692     jmp(POP_LABEL);
3693 
3694     // Setup the registers to start vector comparison loop
3695     bind(COMPARE_WIDE_VECTORS);
3696     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3697       lea(str1, Address(str1, result, scale));
3698       lea(str2, Address(str2, result, scale));
3699     } else {
3700       lea(str1, Address(str1, result, scale1));
3701       lea(str2, Address(str2, result, scale2));
3702     }
3703     subl(result, stride2);
3704     subl(cnt2, stride2);
3705     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3706     negptr(result);
3707 
3708     // In a loop, compare 16 chars (32 bytes) at once using (vpxor+vptest)
3709     bind(COMPARE_WIDE_VECTORS_LOOP);
3710 
3711     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3712       cmpl(cnt2, stride2x2);
3713       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3714       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3715       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3716 
3717       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3718       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3719         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3720         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3721       } else {
3722         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3723         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3724       }
3725       kortestql(mask, mask);
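             // kortestql sets CF only when the combined mask is all ones (every
             // byte compared equal); aboveEqual (CF == 0) means a mismatch.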
3726       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3727       addptr(result, stride2x2);  // update since we already compared at this addr
3728       subl(cnt2, stride2x2);      // and sub the size too
3729       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3730 
3731       vpxor(vec1, vec1);
3732       jmpb(COMPARE_WIDE_TAIL);
3733     }//if (VM_Version::supports_avx512vlbw())
3734 
3735     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3736     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3737       vmovdqu(vec1, Address(str1, result, scale));
3738       vpxor(vec1, Address(str2, result, scale));
3739     } else {
3740       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3741       vpxor(vec1, Address(str2, result, scale2));
3742     }
3743     vptest(vec1, vec1);
3744     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3745     addptr(result, stride2);
3746     subl(cnt2, stride2);
3747     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3748     // clean upper bits of YMM registers
3749     vpxor(vec1, vec1);
3750 
3751     // compare wide vectors tail
3752     bind(COMPARE_WIDE_TAIL);
3753     testptr(result, result);
3754     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3755 
3756     movl(result, stride2);
3757     movl(cnt2, result);
3758     negptr(result);
3759     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3760 
3761     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3762     bind(VECTOR_NOT_EQUAL);
3763     // clean upper bits of YMM registers
3764     vpxor(vec1, vec1);
3765     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3766       lea(str1, Address(str1, result, scale));
3767       lea(str2, Address(str2, result, scale));
3768     } else {
3769       lea(str1, Address(str1, result, scale1));
3770       lea(str2, Address(str2, result, scale2));
3771     }
3772     jmp(COMPARE_16_CHARS);
3773 
3774     // Compare tail chars, length between 1 and 15 chars
3775     bind(COMPARE_TAIL_LONG);
3776     movl(cnt2, result);
3777     cmpl(cnt2, stride);
3778     jcc(Assembler::less, COMPARE_SMALL_STR);
3779 
3780     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3781       movdqu(vec1, Address(str1, 0));
3782     } else {
3783       pmovzxbw(vec1, Address(str1, 0));
3784     }
3785     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3786     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3787     subptr(cnt2, stride);
3788     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3789     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3790       lea(str1, Address(str1, result, scale));
3791       lea(str2, Address(str2, result, scale));
3792     } else {
3793       lea(str1, Address(str1, result, scale1));
3794       lea(str2, Address(str2, result, scale2));
3795     }
3796     negptr(cnt2);
3797     jmpb(WHILE_HEAD_LABEL);
3798 
3799     bind(COMPARE_SMALL_STR);
3800   } else if (UseSSE42Intrinsics) {
3801     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3802     int pcmpmask = 0x19;
3803     // Setup to compare 8-char (16-byte) vectors,
3804     // start from first character again because it has aligned address.
3805     movl(result, cnt2);
3806     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3807     if (ae == StrIntrinsicNode::LL) {
3808       pcmpmask &= ~0x01;
3809     }
3810     jcc(Assembler::zero, COMPARE_TAIL);
3811     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3812       lea(str1, Address(str1, result, scale));
3813       lea(str2, Address(str2, result, scale));
3814     } else {
3815       lea(str1, Address(str1, result, scale1));
3816       lea(str2, Address(str2, result, scale2));
3817     }
3818     negptr(result);
3819 
3820     // pcmpestri
3821     //   inputs:
3822     //     vec1- substring
3823     //     rax - negative string length (elements count)
3824     //     mem - scanned string
3825     //     rdx - string length (elements count)
3826     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3827     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3828     //   outputs:
3829     //     rcx - first mismatched element index
3830     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3831 
3832     bind(COMPARE_WIDE_VECTORS);
3833     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3834       movdqu(vec1, Address(str1, result, scale));
3835       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3836     } else {
3837       pmovzxbw(vec1, Address(str1, result, scale1));
3838       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3839     }
3840     // After pcmpestri cnt1(rcx) contains mismatched element index
3841 
3842     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3843     addptr(result, stride);
3844     subptr(cnt2, stride);
3845     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3846 
3847     // compare wide vectors tail
3848     testptr(result, result);
3849     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3850 
3851     movl(cnt2, stride);
3852     movl(result, stride);
3853     negptr(result);
3854     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3855       movdqu(vec1, Address(str1, result, scale));
3856       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3857     } else {
3858       pmovzxbw(vec1, Address(str1, result, scale1));
3859       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3860     }
3861     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3862 
3863     // Mismatched characters in the vectors
3864     bind(VECTOR_NOT_EQUAL);
3865     addptr(cnt1, result);
3866     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3867     subl(result, cnt2);
3868     jmpb(POP_LABEL);
3869 
3870     bind(COMPARE_TAIL); // limit is zero
3871     movl(cnt2, result);
3872     // Fallthru to tail compare
3873   }
3874   // Shift str2 and str1 to the end of the arrays, negate min
3875   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3876     lea(str1, Address(str1, cnt2, scale));
3877     lea(str2, Address(str2, cnt2, scale));
3878   } else {
3879     lea(str1, Address(str1, cnt2, scale1));
3880     lea(str2, Address(str2, cnt2, scale2));
3881   }
3882   decrementl(cnt2);  // first character was compared already
3883   negptr(cnt2);
3884 
3885   // Compare the rest of the elements
3886   bind(WHILE_HEAD_LABEL);
3887   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3888   subl(result, cnt1);
3889   jccb(Assembler::notZero, POP_LABEL);
3890   increment(cnt2);
3891   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3892 
3893   // Strings are equal up to min length.  Return the length difference.
3894   bind(LENGTH_DIFF_LABEL);
3895   pop(result);
3896   if (ae == StrIntrinsicNode::UU) {
3897     // Divide diff by 2 to get number of chars
3898     sarl(result, 1);
3899   }
3900   jmpb(DONE_LABEL);
3901 
3902   if (VM_Version::supports_avx512vlbw()) {
3903 
3904     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3905 
3906     kmovql(cnt1, mask);
3907     notq(cnt1);
3908     bsfq(cnt2, cnt1);
3909     if (ae != StrIntrinsicNode::LL) {
3910       // Divide diff by 2 to get number of chars
3911       sarl(cnt2, 1);
3912     }
3913     addq(result, cnt2);
3914     if (ae == StrIntrinsicNode::LL) {
3915       load_unsigned_byte(cnt1, Address(str2, result));
3916       load_unsigned_byte(result, Address(str1, result));
3917     } else if (ae == StrIntrinsicNode::UU) {
3918       load_unsigned_short(cnt1, Address(str2, result, scale));
3919       load_unsigned_short(result, Address(str1, result, scale));
3920     } else {
3921       load_unsigned_short(cnt1, Address(str2, result, scale2));
3922       load_unsigned_byte(result, Address(str1, result, scale1));
3923     }
3924     subl(result, cnt1);
3925     jmpb(POP_LABEL);
3926   }//if (VM_Version::supports_avx512vlbw())
3927 
3928   // Discard the stored length difference
3929   bind(POP_LABEL);
3930   pop(cnt1);
3931 
3932   // That's it
3933   bind(DONE_LABEL);
3934   if(ae == StrIntrinsicNode::UL) {
3935     negl(result);
3936   }
3937 
3938 }
3939 
3940 // Search for Non-ASCII character (Negative byte value) in a byte array,
3941 // return the index of the first such character, otherwise the length
3942 // of the array segment searched.
3943 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3944 //   @IntrinsicCandidate
3945 //   public static int countPositives(byte[] ba, int off, int len) {
3946 //     for (int i = off; i < off + len; i++) {
3947 //       if (ba[i] < 0) {
3948 //         return i - off;
3949 //       }
3950 //     }
3951 //     return len;
3952 //   }
3953 void C2_MacroAssembler::count_positives(Register ary1, Register len,
3954   Register result, Register tmp1,
3955   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3956   // rsi: byte array
3957   // rcx: len
3958   // rax: result
3959   ShortBranchVerifier sbv(this);
3960   assert_different_registers(ary1, len, result, tmp1);
3961   assert_different_registers(vec1, vec2);
3962   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3963 
3964   movl(result, len); // copy
3965   // len == 0
3966   testl(len, len);
3967   jcc(Assembler::zero, DONE);
3968 
3969   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3970     VM_Version::supports_avx512vlbw() &&
3971     VM_Version::supports_bmi2()) {
3972 
3973     Label test_64_loop, test_tail, BREAK_LOOP;
3974     movl(tmp1, len);
3975     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3976 
3977     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
3978     andl(len,  0xffffffc0); // vector count (in chars)
3979     jccb(Assembler::zero, test_tail);
3980 
3981     lea(ary1, Address(ary1, len, Address::times_1));
3982     negptr(len);
3983 
3984     bind(test_64_loop);
3985     // Check whether any of these 64 bytes is negative
3986     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3987     kortestql(mask1, mask1);
3988     jcc(Assembler::notZero, BREAK_LOOP);
3989 
3990     addptr(len, 64);
3991     jccb(Assembler::notZero, test_64_loop);
3992 
3993     bind(test_tail);
3994     // bail out when there is nothing to be done
3995     testl(tmp1, -1);
3996     jcc(Assembler::zero, DONE);
3997 
3998 
3999     // check the tail for absence of negatives
4000     // ~(~0 << len) applied up to two times (for 32-bit scenario)
4001     {
4002       Register tmp3_aliased = len;
4003       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4004       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4005       notq(tmp3_aliased);
4006       kmovql(mask2, tmp3_aliased);
4007     }
4008 
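         // Example: for tmp1 == 3 the computed mask is ~(~0 << 3) = 0b111, so the
         // masked compare below only looks at the three remaining tail bytes.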
4009     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4010     ktestq(mask1, mask2);
4011     jcc(Assembler::zero, DONE);
4012 
4013     // do a full check for negative bytes in the tail
4014     movl(len, tmp1); // tmp1 holds the low 6 bits of the original len;
4015                      // ary1 already pointing to the right place
4016     jmpb(TAIL_START);
4017 
4018     bind(BREAK_LOOP);
4019     // At least one byte in the last 64 byte block was negative.
4020     // Set up to look at the last 64 bytes as if they were a tail
4021     lea(ary1, Address(ary1, len, Address::times_1));
4022     addptr(result, len);
4023     // Ignore the very last byte: if all others are positive,
4024     // it must be negative, so we can skip right to the 2+1 byte
4025     // end comparison at this point
4026     orl(result, 63);
4027     movl(len, 63);
4028     // Fallthru to tail compare
4029   } else {
4030 
4031     if (UseAVX >= 2) {
4032       // With AVX2, use 32-byte vector compare
4033       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4034 
4035       // Compare 32-byte vectors
4036       testl(len, 0xffffffe0);   // vector count (in bytes)
4037       jccb(Assembler::zero, TAIL_START);
4038 
4039       andl(len, 0xffffffe0);
4040       lea(ary1, Address(ary1, len, Address::times_1));
4041       negptr(len);
4042 
4043       movl(tmp1, 0x80808080);   // create mask to test for negative (non-ASCII) bytes in vector
4044       movdl(vec2, tmp1);
4045       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4046 
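             // vptest sets ZF only when (vec1 & vec2) == 0, i.e. none of the 32
             // loaded bytes has its sign bit set; notZero means a negative byte.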
4047       bind(COMPARE_WIDE_VECTORS);
4048       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4049       vptest(vec1, vec2);
4050       jccb(Assembler::notZero, BREAK_LOOP);
4051       addptr(len, 32);
4052       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4053 
4054       testl(result, 0x0000001f);   // any bytes remaining?
4055       jcc(Assembler::zero, DONE);
4056 
4057       // Quick test using the already prepared vector mask
4058       movl(len, result);
4059       andl(len, 0x0000001f);
4060       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4061       vptest(vec1, vec2);
4062       jcc(Assembler::zero, DONE);
4063       // There is a negative byte; jump to the tail to determine exactly where
4064       jmpb(TAIL_START);
4065 
4066       bind(BREAK_LOOP);
4067       // At least one byte in the last 32-byte vector is negative.
4068       // Set up to look at the last 32 bytes as if they were a tail
4069       lea(ary1, Address(ary1, len, Address::times_1));
4070       addptr(result, len);
4071       // Ignore the very last byte: if all others are positive,
4072       // it must be negative, so we can skip right to the 2+1 byte
4073       // end comparison at this point
4074       orl(result, 31);
4075       movl(len, 31);
4076       // Fallthru to tail compare
4077     } else if (UseSSE42Intrinsics) {
4078       // With SSE4.2, use double quad vector compare
4079       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4080 
4081       // Compare 16-byte vectors
4082       testl(len, 0xfffffff0);   // vector count (in bytes)
4083       jcc(Assembler::zero, TAIL_START);
4084 
4085       andl(len, 0xfffffff0);
4086       lea(ary1, Address(ary1, len, Address::times_1));
4087       negptr(len);
4088 
4089       movl(tmp1, 0x80808080);
4090       movdl(vec2, tmp1);
4091       pshufd(vec2, vec2, 0);
4092 
4093       bind(COMPARE_WIDE_VECTORS);
4094       movdqu(vec1, Address(ary1, len, Address::times_1));
4095       ptest(vec1, vec2);
4096       jccb(Assembler::notZero, BREAK_LOOP);
4097       addptr(len, 16);
4098       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4099 
4100       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4101       jcc(Assembler::zero, DONE);
4102 
4103       // Quick test using the already prepared vector mask
4104       movl(len, result);
4105       andl(len, 0x0000000f);   // tail count (in bytes)
4106       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4107       ptest(vec1, vec2);
4108       jcc(Assembler::zero, DONE);
4109       jmpb(TAIL_START);
4110 
4111       bind(BREAK_LOOP);
4112       // At least one byte in the last 16-byte vector is negative.
4113       // Set up and look at the last 16 bytes as if they were a tail
4114       lea(ary1, Address(ary1, len, Address::times_1));
4115       addptr(result, len);
4116       // Ignore the very last byte: if all others are positive,
4117       // it must be negative, so we can skip right to the 2+1 byte
4118       // end comparison at this point
4119       orl(result, 15);
4120       movl(len, 15);
4121       // Fallthru to tail compare
4122     }
4123   }
4124 
4125   bind(TAIL_START);
4126   // Compare 4-byte vectors
4127   andl(len, 0xfffffffc); // vector count (in bytes)
4128   jccb(Assembler::zero, COMPARE_CHAR);
4129 
4130   lea(ary1, Address(ary1, len, Address::times_1));
4131   negptr(len);
4132 
4133   bind(COMPARE_VECTORS);
4134   movl(tmp1, Address(ary1, len, Address::times_1));
4135   andl(tmp1, 0x80808080);
4136   jccb(Assembler::notZero, TAIL_ADJUST);
4137   addptr(len, 4);
4138   jccb(Assembler::notZero, COMPARE_VECTORS);
4139 
4140   // Compare trailing char (final 2-3 bytes), if any
4141   bind(COMPARE_CHAR);
4142 
4143   testl(result, 0x2);   // tail  char
4144   jccb(Assembler::zero, COMPARE_BYTE);
4145   load_unsigned_short(tmp1, Address(ary1, 0));
4146   andl(tmp1, 0x00008080);
4147   jccb(Assembler::notZero, CHAR_ADJUST);
4148   lea(ary1, Address(ary1, 2));
4149 
4150   bind(COMPARE_BYTE);
4151   testl(result, 0x1);   // tail  byte
4152   jccb(Assembler::zero, DONE);
4153   load_unsigned_byte(tmp1, Address(ary1, 0));
4154   testl(tmp1, 0x00000080);
4155   jccb(Assembler::zero, DONE);
4156   subptr(result, 1);
4157   jmpb(DONE);
4158 
4159   bind(TAIL_ADJUST);
4160   // There are negative bytes in the last 4-byte block.
4161   // Adjust result and check the next three bytes.
4162   addptr(result, len);
4163   orl(result, 3);
4164   lea(ary1, Address(ary1, len, Address::times_1));
4165   jmpb(COMPARE_CHAR);
4166 
4167   bind(CHAR_ADJUST);
4168   // We are looking at a char + optional byte tail, and found that one
4169   // of the bytes in the char is negative. Adjust the result, check the
4170   // first byte and readjust if needed.
4171   andl(result, 0xfffffffc);
4172   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4173   jccb(Assembler::notZero, DONE);
4174   addptr(result, 1);
4175 
4176   // That's it
4177   bind(DONE);
4178   if (UseAVX >= 2) {
4179     // clean upper bits of YMM registers
4180     vpxor(vec1, vec1);
4181     vpxor(vec2, vec2);
4182   }
4183 }
4184 
4185 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4186 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4187                                       Register limit, Register result, Register chr,
4188                                       XMMRegister vec1, XMMRegister vec2, bool is_char,
4189                                       KRegister mask, bool expand_ary2) {
4190   // for expand_ary2, limit is the (smaller) size of the second array.
4191   ShortBranchVerifier sbv(this);
4192   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4193 
4194   assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4195          "Expansion only implemented for AVX2");
4196 
4197   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4198   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4199 
4200   Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4201   int scaleIncr = expand_ary2 ? 8 : 16;
4202 
4203   if (is_array_equ) {
4204     // Check the input args
4205     cmpoop(ary1, ary2);
4206     jcc(Assembler::equal, TRUE_LABEL);
4207 
4208     // Need additional checks for arrays_equals.
4209     testptr(ary1, ary1);
4210     jcc(Assembler::zero, FALSE_LABEL);
4211     testptr(ary2, ary2);
4212     jcc(Assembler::zero, FALSE_LABEL);
4213 
4214     // Check the lengths
4215     movl(limit, Address(ary1, length_offset));
4216     cmpl(limit, Address(ary2, length_offset));
4217     jcc(Assembler::notEqual, FALSE_LABEL);
4218   }
4219 
4220   // count == 0
4221   testl(limit, limit);
4222   jcc(Assembler::zero, TRUE_LABEL);
4223 
4224   if (is_array_equ) {
4225     // Load array address
4226     lea(ary1, Address(ary1, base_offset));
4227     lea(ary2, Address(ary2, base_offset));
4228   }
4229 
4230   if (is_array_equ && is_char) {
4231     // arrays_equals when used for char[].
4232     shll(limit, 1);      // byte count != 0
4233   }
4234   movl(result, limit); // copy
4235 
4236   if (UseAVX >= 2) {
4237     // With AVX2, use 32-byte vector compare
4238     Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4239 
4240     // Compare 32-byte vectors
4241     if (expand_ary2) {
4242       andl(result, 0x0000000f);  //   tail count (in bytes)
4243       andl(limit, 0xfffffff0);   // vector count (in bytes)
4244       jcc(Assembler::zero, COMPARE_TAIL);
4245     } else {
4246       andl(result, 0x0000001f);  //   tail count (in bytes)
4247       andl(limit, 0xffffffe0);   // vector count (in bytes)
4248       jcc(Assembler::zero, COMPARE_TAIL_16);
4249     }
4250 
4251     lea(ary1, Address(ary1, limit, scaleFactor));
4252     lea(ary2, Address(ary2, limit, Address::times_1));
4253     negptr(limit);
4254 
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // try a 64-byte fast loop
4256       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4257 
4258       cmpl(limit, -64);
4259       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4260 
4261       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4262 
4263       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4264       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4265       kortestql(mask, mask);
4266       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4267       addptr(limit, 64);  // update since we already compared at this addr
4268       cmpl(limit, -64);
4269       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4270 
4271       // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the non-wide path:
4273       //  cmpl(limit, 0);
4274       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4275       // But since we stopped at the points ary{1,2}+limit which are
4276       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4277       // (|limit| <= 32 and result < 32),
4278       // we may just compare the last 64 bytes.
4279       //
      addptr(result, -64);   // this is safe because we just came from this area
4281       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4282       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4283       kortestql(mask, mask);
4284       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4285 
4286       jmp(TRUE_LABEL);
4287 
4288       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4289 
4290     }//if (VM_Version::supports_avx512vlbw())
4291 
4292     bind(COMPARE_WIDE_VECTORS);
4293     vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4294     if (expand_ary2) {
4295       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4296     } else {
4297       vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4298     }
4299     vpxor(vec1, vec2);
4300 
4301     vptest(vec1, vec1);
4302     jcc(Assembler::notZero, FALSE_LABEL);
4303     addptr(limit, scaleIncr * 2);
4304     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4305 
4306     testl(result, result);
4307     jcc(Assembler::zero, TRUE_LABEL);
4308 
4309     vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4310     if (expand_ary2) {
4311       vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4312     } else {
4313       vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4314     }
4315     vpxor(vec1, vec2);
4316 
4317     vptest(vec1, vec1);
4318     jcc(Assembler::notZero, FALSE_LABEL);
4319     jmp(TRUE_LABEL);
4320 
4321     bind(COMPARE_TAIL_16); // limit is zero
4322     movl(limit, result);
4323 
4324     // Compare 16-byte chunks
4325     andl(result, 0x0000000f);  //   tail count (in bytes)
4326     andl(limit, 0xfffffff0);   // vector count (in bytes)
4327     jcc(Assembler::zero, COMPARE_TAIL);
4328 
4329     lea(ary1, Address(ary1, limit, scaleFactor));
4330     lea(ary2, Address(ary2, limit, Address::times_1));
4331     negptr(limit);
4332 
4333     bind(COMPARE_WIDE_VECTORS_16);
4334     movdqu(vec1, Address(ary1, limit, scaleFactor));
4335     if (expand_ary2) {
4336       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4337     } else {
4338       movdqu(vec2, Address(ary2, limit, Address::times_1));
4339     }
4340     pxor(vec1, vec2);
4341 
4342     ptest(vec1, vec1);
4343     jcc(Assembler::notZero, FALSE_LABEL);
4344     addptr(limit, scaleIncr);
4345     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4346 
4347     bind(COMPARE_TAIL); // limit is zero
4348     movl(limit, result);
4349     // Fallthru to tail compare
4350   } else if (UseSSE42Intrinsics) {
4351     // With SSE4.2, use double quad vector compare
4352     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4353 
4354     // Compare 16-byte vectors
4355     andl(result, 0x0000000f);  //   tail count (in bytes)
4356     andl(limit, 0xfffffff0);   // vector count (in bytes)
4357     jcc(Assembler::zero, COMPARE_TAIL);
4358 
4359     lea(ary1, Address(ary1, limit, Address::times_1));
4360     lea(ary2, Address(ary2, limit, Address::times_1));
4361     negptr(limit);
4362 
4363     bind(COMPARE_WIDE_VECTORS);
4364     movdqu(vec1, Address(ary1, limit, Address::times_1));
4365     movdqu(vec2, Address(ary2, limit, Address::times_1));
4366     pxor(vec1, vec2);
4367 
4368     ptest(vec1, vec1);
4369     jcc(Assembler::notZero, FALSE_LABEL);
4370     addptr(limit, 16);
4371     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4372 
4373     testl(result, result);
4374     jcc(Assembler::zero, TRUE_LABEL);
4375 
4376     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4377     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4378     pxor(vec1, vec2);
4379 
4380     ptest(vec1, vec1);
4381     jccb(Assembler::notZero, FALSE_LABEL);
4382     jmpb(TRUE_LABEL);
4383 
4384     bind(COMPARE_TAIL); // limit is zero
4385     movl(limit, result);
4386     // Fallthru to tail compare
4387   }
4388 
4389   // Compare 4-byte vectors
4390   if (expand_ary2) {
4391     testl(result, result);
4392     jccb(Assembler::zero, TRUE_LABEL);
4393   } else {
4394     andl(limit, 0xfffffffc); // vector count (in bytes)
4395     jccb(Assembler::zero, COMPARE_CHAR);
4396   }
4397 
4398   lea(ary1, Address(ary1, limit, scaleFactor));
4399   lea(ary2, Address(ary2, limit, Address::times_1));
4400   negptr(limit);
4401 
4402   bind(COMPARE_VECTORS);
4403   if (expand_ary2) {
4404     // There are no "vector" operations for bytes to shorts
4405     movzbl(chr, Address(ary2, limit, Address::times_1));
4406     cmpw(Address(ary1, limit, Address::times_2), chr);
4407     jccb(Assembler::notEqual, FALSE_LABEL);
4408     addptr(limit, 1);
4409     jcc(Assembler::notZero, COMPARE_VECTORS);
4410     jmp(TRUE_LABEL);
4411   } else {
4412     movl(chr, Address(ary1, limit, Address::times_1));
4413     cmpl(chr, Address(ary2, limit, Address::times_1));
4414     jccb(Assembler::notEqual, FALSE_LABEL);
4415     addptr(limit, 4);
4416     jcc(Assembler::notZero, COMPARE_VECTORS);
4417   }
4418 
4419   // Compare trailing char (final 2 bytes), if any
4420   bind(COMPARE_CHAR);
4421   testl(result, 0x2);   // tail  char
4422   jccb(Assembler::zero, COMPARE_BYTE);
4423   load_unsigned_short(chr, Address(ary1, 0));
4424   load_unsigned_short(limit, Address(ary2, 0));
4425   cmpl(chr, limit);
4426   jccb(Assembler::notEqual, FALSE_LABEL);
4427 
4428   if (is_array_equ && is_char) {
4429     bind(COMPARE_BYTE);
4430   } else {
4431     lea(ary1, Address(ary1, 2));
4432     lea(ary2, Address(ary2, 2));
4433 
4434     bind(COMPARE_BYTE);
4435     testl(result, 0x1);   // tail  byte
4436     jccb(Assembler::zero, TRUE_LABEL);
4437     load_unsigned_byte(chr, Address(ary1, 0));
4438     load_unsigned_byte(limit, Address(ary2, 0));
4439     cmpl(chr, limit);
4440     jccb(Assembler::notEqual, FALSE_LABEL);
4441   }
4442   bind(TRUE_LABEL);
4443   movl(result, 1);   // return true
4444   jmpb(DONE);
4445 
4446   bind(FALSE_LABEL);
4447   xorl(result, result); // return false
4448 
4449   // That's it
4450   bind(DONE);
4451   if (UseAVX >= 2) {
4452     // clean upper bits of YMM registers
4453     vpxor(vec1, vec1);
4454     vpxor(vec2, vec2);
4455   }
4456 }
4457 
4458 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4459 #define __ masm.
4460   Register dst = stub.data<0>();
4461   XMMRegister src = stub.data<1>();
4462   address target = stub.data<2>();
4463   __ bind(stub.entry());
4464   __ subptr(rsp, 8);
4465   __ movdbl(Address(rsp), src);
4466   __ call(RuntimeAddress(target));
4467   // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
4468   __ pop(dst);
4469   __ jmp(stub.continuation());
4470 #undef __
4471 }
4472 
4473 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4474   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4475   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4476 
4477   address slowpath_target;
4478   if (dst_bt == T_INT) {
4479     if (src_bt == T_FLOAT) {
4480       cvttss2sil(dst, src);
4481       cmpl(dst, 0x80000000);
4482       slowpath_target = StubRoutines::x86::f2i_fixup();
4483     } else {
4484       cvttsd2sil(dst, src);
4485       cmpl(dst, 0x80000000);
4486       slowpath_target = StubRoutines::x86::d2i_fixup();
4487     }
4488   } else {
4489     if (src_bt == T_FLOAT) {
4490       cvttss2siq(dst, src);
4491       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4492       slowpath_target = StubRoutines::x86::f2l_fixup();
4493     } else {
4494       cvttsd2siq(dst, src);
4495       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4496       slowpath_target = StubRoutines::x86::d2l_fixup();
4497     }
4498   }
4499 
4500   // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
4501   int max_size = 23 + (UseAPX ? 1 : 0);
4502   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
4503   jcc(Assembler::equal, stub->entry());
4504   bind(stub->continuation());
4505 }
4506 
4507 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4508                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4509   switch(ideal_opc) {
4510     case Op_LShiftVS:
4511       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4512     case Op_LShiftVI:
4513       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4514     case Op_LShiftVL:
4515       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4516     case Op_RShiftVS:
4517       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4518     case Op_RShiftVI:
4519       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4520     case Op_RShiftVL:
4521       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4522     case Op_URShiftVS:
4523       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4524     case Op_URShiftVI:
4525       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4526     case Op_URShiftVL:
4527       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4528     case Op_RotateRightV:
4529       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4530     case Op_RotateLeftV:
4531       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4532     default:
4533       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4534       break;
4535   }
4536 }
4537 
4538 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4539                                                XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4540   if (is_unsigned) {
4541     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4542   } else {
4543     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4544   }
4545 }
4546 
4547 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4548                                                       XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4549   switch (elem_bt) {
4550     case T_BYTE:
4551       if (ideal_opc == Op_SaturatingAddV) {
4552         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4553       } else {
4554         assert(ideal_opc == Op_SaturatingSubV, "");
4555         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4556       }
4557       break;
4558     case T_SHORT:
4559       if (ideal_opc == Op_SaturatingAddV) {
4560         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4561       } else {
4562         assert(ideal_opc == Op_SaturatingSubV, "");
4563         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4564       }
4565       break;
4566     default:
4567       fatal("Unsupported type %s", type2name(elem_bt));
4568       break;
4569   }
4570 }
4571 
4572 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4573                                                         XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4574   switch (elem_bt) {
4575     case T_BYTE:
4576       if (ideal_opc == Op_SaturatingAddV) {
4577         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4578       } else {
4579         assert(ideal_opc == Op_SaturatingSubV, "");
4580         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4581       }
4582       break;
4583     case T_SHORT:
4584       if (ideal_opc == Op_SaturatingAddV) {
4585         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4586       } else {
4587         assert(ideal_opc == Op_SaturatingSubV, "");
4588         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4589       }
4590       break;
4591     default:
4592       fatal("Unsupported type %s", type2name(elem_bt));
4593       break;
4594   }
4595 }
4596 
4597 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4598                                                Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4599   if (is_unsigned) {
4600     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4601   } else {
4602     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4603   }
4604 }
4605 
4606 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4607                                                       XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4608   switch (elem_bt) {
4609     case T_BYTE:
4610       if (ideal_opc == Op_SaturatingAddV) {
4611         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4612       } else {
4613         assert(ideal_opc == Op_SaturatingSubV, "");
4614         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4615       }
4616       break;
4617     case T_SHORT:
4618       if (ideal_opc == Op_SaturatingAddV) {
4619         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4620       } else {
4621         assert(ideal_opc == Op_SaturatingSubV, "");
4622         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4623       }
4624       break;
4625     default:
4626       fatal("Unsupported type %s", type2name(elem_bt));
4627       break;
4628   }
4629 }
4630 
4631 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4632                                                         XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4633   switch (elem_bt) {
4634     case T_BYTE:
4635       if (ideal_opc == Op_SaturatingAddV) {
4636         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4637       } else {
4638         assert(ideal_opc == Op_SaturatingSubV, "");
4639         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4640       }
4641       break;
4642     case T_SHORT:
4643       if (ideal_opc == Op_SaturatingAddV) {
4644         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4645       } else {
4646         assert(ideal_opc == Op_SaturatingSubV, "");
4647         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4648       }
4649       break;
4650     default:
4651       fatal("Unsupported type %s", type2name(elem_bt));
4652       break;
4653   }
4654 }
4655 
4656 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4657                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4658                                     bool is_varshift) {
4659   switch (ideal_opc) {
4660     case Op_AddVB:
4661       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4662     case Op_AddVS:
4663       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4664     case Op_AddVI:
4665       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4666     case Op_AddVL:
4667       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4668     case Op_AddVF:
4669       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4670     case Op_AddVD:
4671       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4672     case Op_SubVB:
4673       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4674     case Op_SubVS:
4675       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4676     case Op_SubVI:
4677       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4678     case Op_SubVL:
4679       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4680     case Op_SubVF:
4681       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4682     case Op_SubVD:
4683       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4684     case Op_MulVS:
4685       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4686     case Op_MulVI:
4687       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4688     case Op_MulVL:
4689       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4690     case Op_MulVF:
4691       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4692     case Op_MulVD:
4693       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4694     case Op_DivVF:
4695       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4696     case Op_DivVD:
4697       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4698     case Op_SqrtVF:
4699       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4700     case Op_SqrtVD:
4701       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4702     case Op_AbsVB:
4703       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4704     case Op_AbsVS:
4705       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4706     case Op_AbsVI:
4707       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4708     case Op_AbsVL:
4709       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4710     case Op_FmaVF:
4711       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4712     case Op_FmaVD:
4713       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4714     case Op_VectorRearrange:
4715       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4716     case Op_LShiftVS:
4717       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4718     case Op_LShiftVI:
4719       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4720     case Op_LShiftVL:
4721       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4722     case Op_RShiftVS:
4723       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4724     case Op_RShiftVI:
4725       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4726     case Op_RShiftVL:
4727       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4728     case Op_URShiftVS:
4729       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4730     case Op_URShiftVI:
4731       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4732     case Op_URShiftVL:
4733       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4734     case Op_RotateLeftV:
4735       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4736     case Op_RotateRightV:
4737       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4738     case Op_MaxV:
4739       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4740     case Op_MinV:
4741       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4742     case Op_UMinV:
4743       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4744     case Op_UMaxV:
4745       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4746     case Op_XorV:
4747       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4748     case Op_OrV:
4749       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4750     case Op_AndV:
4751       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4752     default:
4753       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4754       break;
4755   }
4756 }
4757 
4758 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4759                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4760   switch (ideal_opc) {
4761     case Op_AddVB:
4762       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4763     case Op_AddVS:
4764       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4765     case Op_AddVI:
4766       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4767     case Op_AddVL:
4768       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4769     case Op_AddVF:
4770       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4771     case Op_AddVD:
4772       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4773     case Op_SubVB:
4774       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4775     case Op_SubVS:
4776       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4777     case Op_SubVI:
4778       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4779     case Op_SubVL:
4780       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4781     case Op_SubVF:
4782       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4783     case Op_SubVD:
4784       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4785     case Op_MulVS:
4786       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4787     case Op_MulVI:
4788       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4789     case Op_MulVL:
4790       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4791     case Op_MulVF:
4792       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4793     case Op_MulVD:
4794       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4795     case Op_DivVF:
4796       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4797     case Op_DivVD:
4798       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4799     case Op_FmaVF:
4800       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4801     case Op_FmaVD:
4802       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4803     case Op_MaxV:
4804       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4805     case Op_MinV:
4806       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4807     case Op_UMaxV:
4808       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4809     case Op_UMinV:
4810       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4811     case Op_XorV:
4812       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4813     case Op_OrV:
4814       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4815     case Op_AndV:
4816       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4817     default:
4818       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4819       break;
4820   }
4821 }
4822 
4823 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4824                                   KRegister src1, KRegister src2) {
4825   BasicType etype = T_ILLEGAL;
4826   switch(mask_len) {
4827     case 2:
4828     case 4:
4829     case 8:  etype = T_BYTE; break;
4830     case 16: etype = T_SHORT; break;
4831     case 32: etype = T_INT; break;
4832     case 64: etype = T_LONG; break;
4833     default: fatal("Unsupported type"); break;
4834   }
4835   assert(etype != T_ILLEGAL, "");
4836   switch(ideal_opc) {
4837     case Op_AndVMask:
4838       kand(etype, dst, src1, src2); break;
4839     case Op_OrVMask:
4840       kor(etype, dst, src1, src2); break;
4841     case Op_XorVMask:
4842       kxor(etype, dst, src1, src2); break;
4843     default:
4844       fatal("Unsupported masked operation"); break;
4845   }
4846 }
4847 
4848 /*
4849  * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
4850  * If src is NaN, the result is 0.
4851  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4852  * the result is equal to the value of Integer.MIN_VALUE.
4853  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4854  * the result is equal to the value of Integer.MAX_VALUE.
4855  */
4856 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4857                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4858                                                                    Register rscratch, AddressLiteral float_sign_flip,
4859                                                                    int vec_enc) {
4860   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4861   Label done;
4862   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4863   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4864   vptest(xtmp2, xtmp2, vec_enc);
4865   jccb(Assembler::equal, done);
4866 
4867   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4868   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4869 
4870   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4871   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4872   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4873 
  // Recompute the mask for the remaining special values.
  vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
  // Extract SRC values corresponding to TRUE mask lanes.
  vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip the mask bits so that the MSB of mask lanes corresponding to positive
  // special values is set.
4880   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4881 
4882   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4883   bind(done);
4884 }
4885 
4886 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4887                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4888                                                                     Register rscratch, AddressLiteral float_sign_flip,
4889                                                                     int vec_enc) {
4890   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4891   Label done;
4892   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4893   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4894   kortestwl(ktmp1, ktmp1);
4895   jccb(Assembler::equal, done);
4896 
4897   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4898   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4899   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4900 
4901   kxorwl(ktmp1, ktmp1, ktmp2);
4902   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4903   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4904   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4905   bind(done);
4906 }
4907 
4908 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4909                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4910                                                                      Register rscratch, AddressLiteral double_sign_flip,
4911                                                                      int vec_enc) {
4912   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4913 
4914   Label done;
4915   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4916   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4917   kortestwl(ktmp1, ktmp1);
4918   jccb(Assembler::equal, done);
4919 
4920   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4921   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4922   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4923 
4924   kxorwl(ktmp1, ktmp1, ktmp2);
4925   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4926   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4927   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4928   bind(done);
4929 }
4930 
4931 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4932                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4933                                                                      Register rscratch, AddressLiteral float_sign_flip,
4934                                                                      int vec_enc) {
4935   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4936   Label done;
4937   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4938   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4939   kortestwl(ktmp1, ktmp1);
4940   jccb(Assembler::equal, done);
4941 
4942   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4943   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4944   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4945 
4946   kxorwl(ktmp1, ktmp1, ktmp2);
4947   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4948   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4949   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4950   bind(done);
4951 }
4952 
4953 /*
4954  * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
4955  * If src is NaN, the result is 0.
4956  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4957  * the result is equal to the value of Long.MIN_VALUE.
4958  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4959  * the result is equal to the value of Long.MAX_VALUE.
4960  */
4961 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4962                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4963                                                                       Register rscratch, AddressLiteral double_sign_flip,
4964                                                                       int vec_enc) {
4965   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4966 
4967   Label done;
4968   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4969   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4970   kortestwl(ktmp1, ktmp1);
4971   jccb(Assembler::equal, done);
4972 
4973   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4974   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4975   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4976 
4977   kxorwl(ktmp1, ktmp1, ktmp2);
4978   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4979   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4980   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4981   bind(done);
4982 }
4983 
4984 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4985                                                              XMMRegister xtmp, int index, int vec_enc) {
  assert(vec_enc < Assembler::AVX_512bit, "");
  if (vec_enc == Assembler::AVX_256bit) {
    vextractf128_high(xtmp, src);
    vshufps(dst, src, xtmp, index, vec_enc);
  } else {
    vshufps(dst, src, zero, index, vec_enc);
  }
4993 }
4994 
4995 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4996                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
4997                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
4998   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4999 
5000   Label done;
5001   // Compare the destination lanes with float_sign_flip
5002   // value to get mask for all special values.
5003   movdqu(xtmp1, float_sign_flip, rscratch);
5004   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5005   ptest(xtmp2, xtmp2);
5006   jccb(Assembler::equal, done);
5007 
5008   // Flip float_sign_flip to get max integer value.
5009   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5010   pxor(xtmp1, xtmp4);
5011 
  // Set destination lanes corresponding to unordered source lanes to zero.
5013   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5014   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5015 
  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5017   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5018   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5019 
  // Recompute the mask for the remaining special values.
5021   pxor(xtmp2, xtmp3);
5022   // Extract mask corresponding to non-negative source lanes.
5023   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5024 
  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5026   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5027   pand(xtmp3, xtmp2);
5028 
  // Replace destination lanes holding the special value (0x80000000) with max int
  // if the corresponding source lane holds a positive value.
5031   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5032   bind(done);
5033 }
5034 
5035 
5036 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5037                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5038   switch(to_elem_bt) {
5039     case T_SHORT:
5040       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5041       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5042       vpackusdw(dst, dst, zero, vec_enc);
5043       if (vec_enc == Assembler::AVX_256bit) {
5044         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5045       }
5046       break;
5047     case  T_BYTE:
5048       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5049       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5050       vpackusdw(dst, dst, zero, vec_enc);
5051       if (vec_enc == Assembler::AVX_256bit) {
5052         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5053       }
5054       vpackuswb(dst, dst, zero, vec_enc);
5055       break;
5056     default: assert(false, "%s", type2name(to_elem_bt));
5057   }
5058 }
5059 
5060 /*
5061  * Algorithm for vector D2L and F2I conversions:-
5062  * a) Perform vector D2L/F2I cast.
5063  * b) Choose fast path if none of the result vector lane contains 0x80000000 value.
5064  *    It signifies that source value could be any of the special floating point
5065  *    values(NaN,-Inf,Inf,Max,-Min).
5066  * c) Set destination to zero if source is NaN value.
5067  * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5068  */
5069 
5070 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5071                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5072                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5073   int to_elem_sz = type2aelembytes(to_elem_bt);
5074   assert(to_elem_sz <= 4, "");
5075   vcvttps2dq(dst, src, vec_enc);
5076   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5077   if (to_elem_sz < 4) {
5078     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5079     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5080   }
5081 }
5082 
5083 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5084                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5085                                             Register rscratch, int vec_enc) {
5086   int to_elem_sz = type2aelembytes(to_elem_bt);
5087   assert(to_elem_sz <= 4, "");
5088   vcvttps2dq(dst, src, vec_enc);
5089   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5090   switch(to_elem_bt) {
5091     case T_INT:
5092       break;
5093     case T_SHORT:
5094       evpmovdw(dst, dst, vec_enc);
5095       break;
5096     case T_BYTE:
5097       evpmovdb(dst, dst, vec_enc);
5098       break;
5099     default: assert(false, "%s", type2name(to_elem_bt));
5100   }
5101 }
5102 
5103 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5104                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5105                                             Register rscratch, int vec_enc) {
5106   evcvttps2qq(dst, src, vec_enc);
5107   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5108 }
5109 
5110 // Handling for downcasting from double to integer or sub-word types on AVX2.
5111 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5112                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5113                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5114   int to_elem_sz = type2aelembytes(to_elem_bt);
5115   assert(to_elem_sz < 8, "");
5116   vcvttpd2dq(dst, src, vec_enc);
5117   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5118                                               float_sign_flip, vec_enc);
5119   if (to_elem_sz < 4) {
5120     // xtmp4 holds all zero lanes.
5121     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5122   }
5123 }
5124 
5125 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5126                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5127                                             KRegister ktmp2, AddressLiteral sign_flip,
5128                                             Register rscratch, int vec_enc) {
5129   if (VM_Version::supports_avx512dq()) {
5130     evcvttpd2qq(dst, src, vec_enc);
5131     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5132     switch(to_elem_bt) {
5133       case T_LONG:
5134         break;
5135       case T_INT:
5136         evpmovsqd(dst, dst, vec_enc);
5137         break;
5138       case T_SHORT:
5139         evpmovsqd(dst, dst, vec_enc);
5140         evpmovdw(dst, dst, vec_enc);
5141         break;
5142       case T_BYTE:
5143         evpmovsqd(dst, dst, vec_enc);
5144         evpmovdb(dst, dst, vec_enc);
5145         break;
5146       default: assert(false, "%s", type2name(to_elem_bt));
5147     }
5148   } else {
5149     assert(type2aelembytes(to_elem_bt) <= 4, "");
5150     vcvttpd2dq(dst, src, vec_enc);
5151     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5152     switch(to_elem_bt) {
5153       case T_INT:
5154         break;
5155       case T_SHORT:
5156         evpmovdw(dst, dst, vec_enc);
5157         break;
5158       case T_BYTE:
5159         evpmovdb(dst, dst, vec_enc);
5160         break;
5161       default: assert(false, "%s", type2name(to_elem_bt));
5162     }
5163   }
5164 }
5165 
5166 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5167                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5168                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode round-towards -inf,
  // and restore the original MXCSR.RC mode afterwards.
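  // Worked example (assuming Java's Math.round semantics, which round half toward +inf):
  // for val = -2.7, val + 0.5 = -2.2, and rounding toward -inf yields -3 == Math.round(-2.7),
  // whereas the default round-to-nearest mode (or truncation) would give -2.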
5171   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5172 
5173   mov64(tmp, julong_cast(0.5L));
5174   evpbroadcastq(xtmp1, tmp, vec_enc);
5175   vaddpd(xtmp1, src , xtmp1, vec_enc);
5176   evcvtpd2qq(dst, xtmp1, vec_enc);
  vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                                double_sign_flip, vec_enc);
5179 
5180   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5181 }
5182 
5183 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5184                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5185                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode round-towards -inf,
  // and restore the original MXCSR.RC mode afterwards.
5188   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5189 
5190   movl(tmp, jint_cast(0.5));
5191   movq(xtmp1, tmp);
5192   vbroadcastss(xtmp1, xtmp1, vec_enc);
5193   vaddps(xtmp1, src , xtmp1, vec_enc);
5194   vcvtps2dq(dst, xtmp1, vec_enc);
5195   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5196                                               float_sign_flip, vec_enc);
5197 
5198   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5199 }
5200 
5201 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5202                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5203                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode round-towards -inf,
  // and restore the original MXCSR.RC mode afterwards.
5206   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5207 
5208   movl(tmp, jint_cast(0.5));
5209   movq(xtmp1, tmp);
5210   vbroadcastss(xtmp1, xtmp1, vec_enc);
5211   vaddps(xtmp1, src , xtmp1, vec_enc);
5212   vcvtps2dq(dst, xtmp1, vec_enc);
5213   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5214 
5215   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5216 }
5217 
5218 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5219                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5220   switch (from_elem_bt) {
5221     case T_BYTE:
5222       switch (to_elem_bt) {
5223         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5224         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5225         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5226         default: ShouldNotReachHere();
5227       }
5228       break;
5229     case T_SHORT:
5230       switch (to_elem_bt) {
5231         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5232         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5233         default: ShouldNotReachHere();
5234       }
5235       break;
5236     case T_INT:
5237       assert(to_elem_bt == T_LONG, "");
5238       vpmovzxdq(dst, src, vlen_enc);
5239       break;
5240     default:
5241       ShouldNotReachHere();
5242   }
5243 }
5244 
5245 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5246                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5247   switch (from_elem_bt) {
5248     case T_BYTE:
5249       switch (to_elem_bt) {
5250         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5251         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5252         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5253         default: ShouldNotReachHere();
5254       }
5255       break;
5256     case T_SHORT:
5257       switch (to_elem_bt) {
5258         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5259         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5260         default: ShouldNotReachHere();
5261       }
5262       break;
5263     case T_INT:
5264       assert(to_elem_bt == T_LONG, "");
5265       vpmovsxdq(dst, src, vlen_enc);
5266       break;
5267     default:
5268       ShouldNotReachHere();
5269   }
5270 }
5271 
5272 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5273                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5274   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5275   assert(vlen_enc != AVX_512bit, "");
5276 
5277   int dst_bt_size = type2aelembytes(dst_bt);
5278   int src_bt_size = type2aelembytes(src_bt);
5279   if (dst_bt_size > src_bt_size) {
5280     switch (dst_bt_size / src_bt_size) {
5281       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5282       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5283       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5284       default: ShouldNotReachHere();
5285     }
5286   } else {
5287     assert(dst_bt_size < src_bt_size, "");
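    // Narrowing uses signed saturating packs: mask lanes are either 0 or -1 and both
    // survive signed saturation unchanged. With 256-bit vectors the vpack* instructions
    // operate within each 128-bit lane, so vpermq with selector 0x08 gathers quadwords
    // 0 and 2 into the low half to make the packed result contiguous.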
5288     switch (src_bt_size / dst_bt_size) {
5289       case 2: {
5290         if (vlen_enc == AVX_128bit) {
5291           vpacksswb(dst, src, src, vlen_enc);
5292         } else {
5293           vpacksswb(dst, src, src, vlen_enc);
5294           vpermq(dst, dst, 0x08, vlen_enc);
5295         }
5296         break;
5297       }
5298       case 4: {
5299         if (vlen_enc == AVX_128bit) {
5300           vpackssdw(dst, src, src, vlen_enc);
5301           vpacksswb(dst, dst, dst, vlen_enc);
5302         } else {
5303           vpackssdw(dst, src, src, vlen_enc);
5304           vpermq(dst, dst, 0x08, vlen_enc);
5305           vpacksswb(dst, dst, dst, AVX_128bit);
5306         }
5307         break;
5308       }
5309       case 8: {
5310         if (vlen_enc == AVX_128bit) {
5311           vpshufd(dst, src, 0x08, vlen_enc);
5312           vpackssdw(dst, dst, dst, vlen_enc);
5313           vpacksswb(dst, dst, dst, vlen_enc);
5314         } else {
5315           vpshufd(dst, src, 0x08, vlen_enc);
5316           vpermq(dst, dst, 0x08, vlen_enc);
5317           vpackssdw(dst, dst, dst, AVX_128bit);
5318           vpacksswb(dst, dst, dst, AVX_128bit);
5319         }
5320         break;
5321       }
5322       default: ShouldNotReachHere();
5323     }
5324   }
5325 }
5326 
5327 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5328                                    bool merge, BasicType bt, int vlen_enc) {
5329   if (bt == T_INT) {
5330     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5331   } else {
5332     assert(bt == T_LONG, "");
5333     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5334   }
5335 }
5336 
5337 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5338                                    bool merge, BasicType bt, int vlen_enc) {
5339   if (bt == T_INT) {
5340     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5341   } else {
5342     assert(bt == T_LONG, "");
5343     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5344   }
5345 }
5346 
5347 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5348                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5349                                                int vec_enc) {
5350   int index = 0;
5351   int vindex = 0;
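  // Expand the scalar mask to one byte per lane: pdep with the 0x0101... selector
  // deposits bit i of src into bit 0 of byte i, so e.g. mask bits 0b1011 become the
  // byte sequence 01 01 00 01 (lowest lane first).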
5352   mov64(rtmp1, 0x0101010101010101L);
5353   pdepq(rtmp1, src, rtmp1);
5354   if (mask_len > 8) {
5355     movq(rtmp2, src);
5356     vpxor(xtmp, xtmp, xtmp, vec_enc);
5357     movq(xtmp, rtmp1);
5358   }
5359   movq(dst, rtmp1);
5360 
5361   mask_len -= 8;
5362   while (mask_len > 0) {
5363     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5364     index++;
5365     if ((index % 2) == 0) {
5366       pxor(xtmp, xtmp);
5367     }
5368     mov64(rtmp1, 0x0101010101010101L);
5369     shrq(rtmp2, 8);
5370     pdepq(rtmp1, rtmp2, rtmp1);
5371     pinsrq(xtmp, rtmp1, index % 2);
5372     vindex = index / 2;
5373     if (vindex) {
      // Write the entire 16-byte vector only when both 64-bit
      // lanes have been updated, to save redundant instructions.
5376       if (index % 2) {
5377         vinsertf128(dst, dst, xtmp, vindex);
5378       }
5379     } else {
5380       vmovdqu(dst, xtmp);
5381     }
5382     mask_len -= 8;
5383   }
5384 }
5385 
5386 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5387   switch(opc) {
5388     case Op_VectorMaskTrueCount:
5389       popcntq(dst, tmp);
5390       break;
5391     case Op_VectorMaskLastTrue:
5392       if (VM_Version::supports_lzcnt()) {
5393         lzcntq(tmp, tmp);
5394         movl(dst, 63);
5395         subl(dst, tmp);
5396       } else {
5397         movl(dst, -1);
5398         bsrq(tmp, tmp);
5399         cmov32(Assembler::notZero, dst, tmp);
5400       }
5401       break;
5402     case Op_VectorMaskFirstTrue:
5403       if (VM_Version::supports_bmi1()) {
5404         if (masklen < 32) {
5405           orl(tmp, 1 << masklen);
5406           tzcntl(dst, tmp);
5407         } else if (masklen == 32) {
5408           tzcntl(dst, tmp);
5409         } else {
5410           assert(masklen == 64, "");
5411           tzcntq(dst, tmp);
5412         }
5413       } else {
5414         if (masklen < 32) {
5415           orl(tmp, 1 << masklen);
5416           bsfl(dst, tmp);
5417         } else {
5418           assert(masklen == 32 || masklen == 64, "");
5419           movl(dst, masklen);
5420           if (masklen == 32)  {
5421             bsfl(tmp, tmp);
5422           } else {
5423             bsfq(tmp, tmp);
5424           }
5425           cmov32(Assembler::notZero, dst, tmp);
5426         }
5427       }
5428       break;
5429     case Op_VectorMaskToLong:
5430       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5431       break;
5432     default: assert(false, "Unhandled mask operation");
5433   }
5434 }
5435 
5436 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5437                                               int masklen, int masksize, int vec_enc) {
5438   assert(VM_Version::supports_popcnt(), "");
5439 
  if (VM_Version::supports_avx512bw()) {
5441     kmovql(tmp, mask);
5442   } else {
5443     assert(masklen <= 16, "");
5444     kmovwl(tmp, mask);
5445   }
5446 
  // A mask generated by partial vector comparison/replicate/mask-manipulation
  // operations needs to be clipped.
5449   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5450     andq(tmp, (1 << masklen) - 1);
5451   }
5452 
5453   vector_mask_operation_helper(opc, dst, tmp, masklen);
5454 }
5455 
5456 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5457                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5458   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5459          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5460   assert(VM_Version::supports_popcnt(), "");
5461 
5462   bool need_clip = false;
5463   switch(bt) {
5464     case T_BOOLEAN:
      // While masks of other types contain lane values of 0 or -1, boolean masks contain lane values of 0 or 1
5466       vpxor(xtmp, xtmp, xtmp, vec_enc);
5467       vpsubb(xtmp, xtmp, mask, vec_enc);
5468       vpmovmskb(tmp, xtmp, vec_enc);
5469       need_clip = masklen < 16;
5470       break;
5471     case T_BYTE:
5472       vpmovmskb(tmp, mask, vec_enc);
5473       need_clip = masklen < 16;
5474       break;
5475     case T_SHORT:
5476       vpacksswb(xtmp, mask, mask, vec_enc);
5477       if (masklen >= 16) {
5478         vpermpd(xtmp, xtmp, 8, vec_enc);
5479       }
5480       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5481       need_clip = masklen < 16;
5482       break;
5483     case T_INT:
5484     case T_FLOAT:
5485       vmovmskps(tmp, mask, vec_enc);
5486       need_clip = masklen < 4;
5487       break;
5488     case T_LONG:
5489     case T_DOUBLE:
5490       vmovmskpd(tmp, mask, vec_enc);
5491       need_clip = masklen < 2;
5492       break;
5493     default: assert(false, "Unhandled type, %s", type2name(bt));
5494   }
5495 
  // A mask generated by partial vector comparison/replicate/mask-manipulation
  // operations needs to be clipped.
5498   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5499     // need_clip implies masklen < 32
5500     andq(tmp, (1 << masklen) - 1);
5501   }
5502 
5503   vector_mask_operation_helper(opc, dst, tmp, masklen);
5504 }
5505 
5506 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5507                                              Register rtmp2, int mask_len) {
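  // Compress the mask: clear bits above mask_len, then use PEXT to gather that many
  // set bits from an all-ones source into the low positions, i.e. the resulting mask
  // has popcount(src & ((1 << mask_len) - 1)) contiguous low bits set.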
5508   kmov(rtmp1, src);
5509   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5510   mov64(rtmp2, -1L);
5511   pextq(rtmp2, rtmp2, rtmp1);
5512   kmov(dst, rtmp2);
5513 }
5514 
5515 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5516                                                     XMMRegister mask, Register rtmp, Register rscratch,
5517                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5518                                                     int vec_enc) {
5519   assert(type2aelembytes(bt) >= 4, "");
5520   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5521   address compress_perm_table = nullptr;
5522   address expand_perm_table = nullptr;
5523   if (type2aelembytes(bt) == 8) {
5524     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5525     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5526     vmovmskpd(rtmp, mask, vec_enc);
5527   } else {
5528     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5529     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5530     vmovmskps(rtmp, mask, vec_enc);
5531   }
5532   shlq(rtmp, 5); // for 32 byte permute row.
5533   if (opcode == Op_CompressV) {
5534     lea(rscratch, ExternalAddress(compress_perm_table));
5535   } else {
5536     lea(rscratch, ExternalAddress(expand_perm_table));
5537   }
5538   addptr(rtmp, rscratch);
5539   vmovdqu(permv, Address(rtmp));
5540   vpermps(dst, permv, src, Assembler::AVX_256bit);
5541   vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with a zero vector using the permute mask: each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, so the row can also be used as a blending mask after
  // compressing/expanding the source vector lanes.
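  // For instance (illustrative only; the exact layout is defined by the stub tables above),
  // a 32-bit lane compress with mask lanes {-1, 0, -1, 0, ...} would select a permute row
  // of the form {0, 2, -1, -1, ...}: lanes 0 and 2 are gathered to the front and the -1
  // entries select zero in the blend below.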
5546   vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
5547 }
5548 
5549 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5550                                                bool merge, BasicType bt, int vec_enc) {
5551   if (opcode == Op_CompressV) {
5552     switch(bt) {
5553     case T_BYTE:
5554       evpcompressb(dst, mask, src, merge, vec_enc);
5555       break;
5556     case T_CHAR:
5557     case T_SHORT:
5558       evpcompressw(dst, mask, src, merge, vec_enc);
5559       break;
5560     case T_INT:
5561       evpcompressd(dst, mask, src, merge, vec_enc);
5562       break;
5563     case T_FLOAT:
5564       evcompressps(dst, mask, src, merge, vec_enc);
5565       break;
5566     case T_LONG:
5567       evpcompressq(dst, mask, src, merge, vec_enc);
5568       break;
5569     case T_DOUBLE:
5570       evcompresspd(dst, mask, src, merge, vec_enc);
5571       break;
5572     default:
5573       fatal("Unsupported type %s", type2name(bt));
5574       break;
5575     }
5576   } else {
5577     assert(opcode == Op_ExpandV, "");
5578     switch(bt) {
5579     case T_BYTE:
5580       evpexpandb(dst, mask, src, merge, vec_enc);
5581       break;
5582     case T_CHAR:
5583     case T_SHORT:
5584       evpexpandw(dst, mask, src, merge, vec_enc);
5585       break;
5586     case T_INT:
5587       evpexpandd(dst, mask, src, merge, vec_enc);
5588       break;
5589     case T_FLOAT:
5590       evexpandps(dst, mask, src, merge, vec_enc);
5591       break;
5592     case T_LONG:
5593       evpexpandq(dst, mask, src, merge, vec_enc);
5594       break;
5595     case T_DOUBLE:
5596       evexpandpd(dst, mask, src, merge, vec_enc);
5597       break;
5598     default:
5599       fatal("Unsupported type %s", type2name(bt));
5600       break;
5601     }
5602   }
5603 }
5604 
5605 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5606                                            KRegister ktmp1, int vec_enc) {
5607   if (opcode == Op_SignumVD) {
5608     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5610     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5611     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // If src is NaN, -0.0 or 0.0, return src.
5613     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5614     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5615   } else {
5616     assert(opcode == Op_SignumVF, "");
5617     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5619     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5620     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // If src is NaN, -0.0 or 0.0, return src.
5622     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5623     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5624   }
5625 }
5626 
5627 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5628                                           XMMRegister xtmp1, int vec_enc) {
5629   if (opcode == Op_SignumVD) {
5630     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5632     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // If src is NaN, -0.0 or 0.0, return src.
5634     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5635     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5636   } else {
5637     assert(opcode == Op_SignumVF, "");
5638     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5640     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // If src is NaN, -0.0 or 0.0, return src.
5642     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5643     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5644   }
5645 }
5646 
5647 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5648   if (VM_Version::supports_avx512bw()) {
5649     if (mask_len > 32) {
5650       kmovql(dst, src);
5651     } else {
5652       kmovdl(dst, src);
5653       if (mask_len != 32) {
5654         kshiftrdl(dst, dst, 32 - mask_len);
5655       }
5656     }
5657   } else {
5658     assert(mask_len <= 16, "");
5659     kmovwl(dst, src);
5660     if (mask_len != 16) {
5661       kshiftrwl(dst, dst, 16 - mask_len);
5662     }
5663   }
5664 }
5665 
5666 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5667   int lane_size = type2aelembytes(bt);
5668   if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5669       (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5670     movptr(rtmp, imm32);
5671     switch(lane_size) {
5672       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5673       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5674       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5675       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
5678     }
5679   } else {
5680     movptr(rtmp, imm32);
5681     movq(dst, rtmp);
5682     switch(lane_size) {
5683       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5684       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5685       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5686       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
5689     }
5690   }
5691 }
5692 
5693 //
// Following is the lookup table based popcount computation algorithm:
//       Index   Bit set count
//     [ 0000 ->   0,
//       0001 ->   1,
//       0010 ->   1,
//       0011 ->   2,
//       0100 ->   1,
//       0101 ->   2,
//       0110 ->   2,
//       0111 ->   3,
//       1000 ->   1,
//       1001 ->   2,
//       1010 ->   2,
//       1011 ->   3,
//       1100 ->   2,
//       1101 ->   3,
//       1110 ->   3,
//       1111 ->   4 ]
//  a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as
//     shuffle indices for lookup table accesses.
//  b. Right shift each byte of the vector lane by 4 positions.
//  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
//     shuffle indices for lookup table accesses.
//  d. Add the bit-set counts of the upper and lower 4 bits of each byte.
//  e. Unpack double words to quad words and compute the sum of absolute differences of the
//     bit-set counts of all the bytes of a quadword.
//  f. Perform step e. for the upper 128-bit vector lane.
//  g. Pack the bit-set counts of quadwords back to double words.
//  h. Unpacking and packing operations are not needed for 64-bit vector lanes.
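//
// A scalar sketch of the per-byte computation performed below (illustrative only,
// not used by the generated code):
//
//   static const uint8_t lut[16] = {0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4};
//   uint8_t popcount_byte(uint8_t b) {
//     return lut[b & 0x0F] + lut[(b >> 4) & 0x0F];   // steps a-d above
//   }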
5722 
5723 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5724                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5725   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5726   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5727   vpsrlw(dst, src, 4, vec_enc);
5728   vpand(dst, dst, xtmp1, vec_enc);
5729   vpand(xtmp1, src, xtmp1, vec_enc);
5730   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5731   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5732   vpshufb(dst, xtmp2, dst, vec_enc);
5733   vpaddb(dst, dst, xtmp1, vec_enc);
5734 }
5735 
5736 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5737                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5738   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5739   // Following code is as per steps e,f,g and h of above algorithm.
5740   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5741   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5742   vpsadbw(dst, dst, xtmp2, vec_enc);
5743   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5744   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5745   vpackuswb(dst, xtmp1, dst, vec_enc);
5746 }
5747 
5748 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5749                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5750   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5751   // Add the popcount of upper and lower bytes of word.
5752   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5753   vpsrlw(dst, xtmp1, 8, vec_enc);
5754   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5755   vpaddw(dst, dst, xtmp1, vec_enc);
5756 }
5757 
5758 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5759                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5760   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5761   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5762   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5763 }
5764 
5765 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5766                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5767   switch(bt) {
5768     case T_LONG:
5769       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5770       break;
5771     case T_INT:
5772       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5773       break;
5774     case T_CHAR:
5775     case T_SHORT:
5776       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5777       break;
5778     case T_BYTE:
5779     case T_BOOLEAN:
5780       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5781       break;
5782     default:
5783       fatal("Unsupported type %s", type2name(bt));
5784       break;
5785   }
5786 }
5787 
5788 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5789                                                       KRegister mask, bool merge, int vec_enc) {
5790   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5791   switch(bt) {
5792     case T_LONG:
5793       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5794       evpopcntq(dst, mask, src, merge, vec_enc);
5795       break;
5796     case T_INT:
5797       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5798       evpopcntd(dst, mask, src, merge, vec_enc);
5799       break;
5800     case T_CHAR:
5801     case T_SHORT:
5802       assert(VM_Version::supports_avx512_bitalg(), "");
5803       evpopcntw(dst, mask, src, merge, vec_enc);
5804       break;
5805     case T_BYTE:
5806     case T_BOOLEAN:
5807       assert(VM_Version::supports_avx512_bitalg(), "");
5808       evpopcntb(dst, mask, src, merge, vec_enc);
5809       break;
5810     default:
5811       fatal("Unsupported type %s", type2name(bt));
5812       break;
5813   }
5814 }
5815 
// The bit reversal algorithm first reverses the bits of each byte, followed by
// a byte level reversal for multi-byte primitive types (short/int/long).
// The algorithm performs a lookup table access to get the reverse bit sequence
// corresponding to a 4 bit value; thus the reverse bit sequence for a byte
// is obtained by swapping the reverse bit sequences of the upper and lower
// nibble of the byte.
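//
// A scalar sketch of the per-byte step (illustrative only, not used by the generated code):
//
//   static const uint8_t rev_lut[16] = {0x0,0x8,0x4,0xC, 0x2,0xA,0x6,0xE,
//                                       0x1,0x9,0x5,0xD, 0x3,0xB,0x7,0xF};
//   uint8_t reverse_byte_bits(uint8_t b) {
//     return (uint8_t)((rev_lut[b & 0x0F] << 4) | rev_lut[(b >> 4) & 0x0F]);
//   }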
5822 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5823                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5824   if (VM_Version::supports_avx512vlbw()) {
5825 
5826     // Get the reverse bit sequence of lower nibble of each byte.
5827     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5828     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5829     evpandq(dst, xtmp2, src, vec_enc);
5830     vpshufb(dst, xtmp1, dst, vec_enc);
5831     vpsllq(dst, dst, 4, vec_enc);
5832 
5833     // Get the reverse bit sequence of upper nibble of each byte.
5834     vpandn(xtmp2, xtmp2, src, vec_enc);
5835     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5836     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5837 
5838     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5839     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5840     evporq(xtmp2, dst, xtmp2, vec_enc);
5841     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5842 
  } else if (vec_enc == Assembler::AVX_512bit) {
5844     // Shift based bit reversal.
5845     assert(bt == T_LONG || bt == T_INT, "");
5846 
5847     // Swap lower and upper nibble of each byte.
5848     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5849 
5850     // Swap two least and most significant bits of each nibble.
5851     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5852 
5853     // Swap adjacent pair of bits.
5854     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5855     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5856 
5857     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5858     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5859   } else {
5860     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5861     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5862 
5863     // Get the reverse bit sequence of lower nibble of each byte.
5864     vpand(dst, xtmp2, src, vec_enc);
5865     vpshufb(dst, xtmp1, dst, vec_enc);
5866     vpsllq(dst, dst, 4, vec_enc);
5867 
5868     // Get the reverse bit sequence of upper nibble of each byte.
5869     vpandn(xtmp2, xtmp2, src, vec_enc);
5870     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5871     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5872 
5873     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5874     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5875     vpor(xtmp2, dst, xtmp2, vec_enc);
5876     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5877   }
5878 }
5879 
5880 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5881                                                 XMMRegister xtmp, Register rscratch) {
5882   assert(VM_Version::supports_gfni(), "");
5883   assert(rscratch != noreg || always_reachable(mask), "missing");
5884 
  // Galois field instruction based bit reversal, using the following algorithm:
5886   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
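  // The broadcast 8x8 bit-matrix in 'mask' is expected to hold the bit-reversal affine
  // matrix (0x8040201008040201, as used by reverseI/reverseL below), so a single
  // VGF2P8AFFINEQB flips the bit order within every byte.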
5887   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5888   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5889   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5890 }
5891 
5892 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5893                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
5894   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5895   evpandq(dst, xtmp1, src, vec_enc);
5896   vpsllq(dst, dst, nbits, vec_enc);
5897   vpandn(xtmp1, xtmp1, src, vec_enc);
5898   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5899   evporq(dst, dst, xtmp1, vec_enc);
5900 }
5901 
5902 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5903                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5904   // Shift based bit reversal.
5905   assert(VM_Version::supports_evex(), "");
5906   switch(bt) {
5907     case T_LONG:
5908       // Swap upper and lower double word of each quad word.
5909       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5910       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5911       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5912       break;
5913     case T_INT:
5914       // Swap upper and lower word of each double word.
5915       evprord(xtmp1, k0, src, 16, true, vec_enc);
5916       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5917       break;
5918     case T_CHAR:
5919     case T_SHORT:
5920       // Swap upper and lower byte of each word.
5921       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5922       break;
5923     case T_BYTE:
5924       evmovdquq(dst, k0, src, true, vec_enc);
5925       break;
5926     default:
5927       fatal("Unsupported type %s", type2name(bt));
5928       break;
5929   }
5930 }
5931 
5932 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5933   if (bt == T_BYTE) {
5934     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
5935       evmovdquq(dst, k0, src, true, vec_enc);
5936     } else {
5937       vmovdqu(dst, src);
5938     }
5939     return;
5940   }
5941   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
5942   // pre-computed shuffle indices.
5943   switch(bt) {
5944     case T_LONG:
5945       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
5946       break;
5947     case T_INT:
5948       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
5949       break;
5950     case T_CHAR:
5951     case T_SHORT:
5952       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
5953       break;
5954     default:
5955       fatal("Unsupported type %s", type2name(bt));
5956       break;
5957   }
5958   vpshufb(dst, src, dst, vec_enc);
5959 }
5960 
5961 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5962                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5963                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
5964   assert(is_integral_type(bt), "");
5965   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5966   assert(VM_Version::supports_avx512cd(), "");
5967   switch(bt) {
5968     case T_LONG:
5969       evplzcntq(dst, ktmp, src, merge, vec_enc);
5970       break;
5971     case T_INT:
5972       evplzcntd(dst, ktmp, src, merge, vec_enc);
5973       break;
5974     case T_SHORT:
5975       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
5976       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
5977       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
5978       vpunpckhwd(dst, xtmp1, src, vec_enc);
5979       evplzcntd(dst, ktmp, dst, merge, vec_enc);
5980       vpackusdw(dst, xtmp2, dst, vec_enc);
5981       break;
5982     case T_BYTE:
5983       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5984       // accessing the lookup table.
5985       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5986       // accessing the lookup table.
5987       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5988       assert(VM_Version::supports_avx512bw(), "");
5989       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
5990       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
5991       vpand(xtmp2, dst, src, vec_enc);
5992       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5993       vpsrlw(xtmp3, src, 4, vec_enc);
5994       vpand(xtmp3, dst, xtmp3, vec_enc);
5995       vpshufb(dst, xtmp1, xtmp3, vec_enc);
5996       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5997       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
5998       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
5999       break;
6000     default:
6001       fatal("Unsupported type %s", type2name(bt));
6002       break;
6003   }
6004 }
6005 
6006 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6007                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6008   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6009   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6010   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6011   // accessing the lookup table.
6012   vpand(dst, xtmp2, src, vec_enc);
6013   vpshufb(dst, xtmp1, dst, vec_enc);
6014   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6015   // accessing the lookup table.
6016   vpsrlw(xtmp3, src, 4, vec_enc);
6017   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6018   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6019   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6020   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6021   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6022   vpaddb(dst, dst, xtmp2, vec_enc);
6023   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6024 }
6025 
6026 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6027                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6028   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6029   // Add zero counts of lower byte and upper byte of a word if
6030   // upper byte holds a zero value.
6031   vpsrlw(xtmp3, src, 8, vec_enc);
6032   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6033   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6034   vpsllw(xtmp2, dst, 8, vec_enc);
6035   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6036   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6037   vpsrlw(dst, dst, 8, vec_enc);
6038 }
6039 
6040 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6041                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in normalized 1.m form,
  // the biased exponent can be used to compute the leading zero count as per the
  // following formula:
  // LZCNT = 31 - (biased_exp - 127)
  // Special handling has been introduced for zero, MAX_INT and negative source values.
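  // Worked example (illustrative): src = 0x00008000 = 2^15 converts to a float with
  // biased_exp = 142, so LZCNT = 31 - (142 - 127) = 16, matching lzcnt(0x00008000).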
6047 
6048   // Broadcast 0xFF
6049   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6050   vpsrld(xtmp1, xtmp1, 24, vec_enc);
6051 
6052   // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
6053   // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
6054   // contributes to the leading number of zeros.
6055   vpsrld(xtmp2, src, 1, vec_enc);
6056   vpandn(xtmp3, xtmp2, src, vec_enc);
6057 
6058   // Extract biased exponent.
6059   vcvtdq2ps(dst, xtmp3, vec_enc);
6060   vpsrld(dst, dst, 23, vec_enc);
6061   vpand(dst, dst, xtmp1, vec_enc);
6062 
6063   // Broadcast 127.
6064   vpsrld(xtmp1, xtmp1, 1, vec_enc);
6065   // Exponent = biased_exp - 127
6066   vpsubd(dst, dst, xtmp1, vec_enc);
6067 
6068   // Exponent_plus_one = Exponent + 1
6069   vpsrld(xtmp3, xtmp1, 6, vec_enc);
6070   vpaddd(dst, dst, xtmp3, vec_enc);
6071 
6072   // Replace -ve exponent with zero, exponent is -ve when src
6073   // lane contains a zero value.
6074   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6075   vblendvps(dst, dst, xtmp2, dst, vec_enc);
6076 
6077   // Rematerialize broadcast 32.
6078   vpslld(xtmp1, xtmp3, 5, vec_enc);
6079   // Exponent is 32 if corresponding source lane contains max_int value.
6080   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6081   // LZCNT = 32 - exponent_plus_one
6082   vpsubd(dst, xtmp1, dst, vec_enc);
6083 
6084   // Replace LZCNT with a value 1 if corresponding source lane
6085   // contains max_int value.
6086   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6087 
6088   // Replace biased_exp with 0 if source lane value is less than zero.
6089   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6090   vblendvps(dst, dst, xtmp2, src, vec_enc);
6091 }
6092 
6093 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6094                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6095   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6096   // Add zero counts of lower word and upper word of a double word if
6097   // upper word holds a zero value.
6098   vpsrld(xtmp3, src, 16, vec_enc);
6099   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6100   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6101   vpslld(xtmp2, dst, 16, vec_enc);
6102   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6103   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6104   vpsrld(dst, dst, 16, vec_enc);
6105   // Add zero counts of lower doubleword and upper doubleword of a
6106   // quadword if upper doubleword holds a zero value.
6107   vpsrlq(xtmp3, src, 32, vec_enc);
6108   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6109   vpsllq(xtmp2, dst, 32, vec_enc);
6110   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6111   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6112   vpsrlq(dst, dst, 32, vec_enc);
6113 }
6114 
6115 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6116                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6117                                                        Register rtmp, int vec_enc) {
6118   assert(is_integral_type(bt), "unexpected type");
6119   assert(vec_enc < Assembler::AVX_512bit, "");
6120   switch(bt) {
6121     case T_LONG:
6122       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6123       break;
6124     case T_INT:
6125       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6126       break;
6127     case T_SHORT:
6128       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6129       break;
6130     case T_BYTE:
6131       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6132       break;
6133     default:
6134       fatal("Unsupported type %s", type2name(bt));
6135       break;
6136   }
6137 }
6138 
6139 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6140   switch(bt) {
6141     case T_BYTE:
6142       vpsubb(dst, src1, src2, vec_enc);
6143       break;
6144     case T_SHORT:
6145       vpsubw(dst, src1, src2, vec_enc);
6146       break;
6147     case T_INT:
6148       vpsubd(dst, src1, src2, vec_enc);
6149       break;
6150     case T_LONG:
6151       vpsubq(dst, src1, src2, vec_enc);
6152       break;
6153     default:
6154       fatal("Unsupported type %s", type2name(bt));
6155       break;
6156   }
6157 }
6158 
// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
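//
// Worked example (illustrative) for a 32-bit lane: x = 8 = 0b1000
//   (x - 1) & ~x = 0b0111, CLZ(0b0111) = 29, so CTZ = 32 - 29 = 3.
// For x == 0 the masked value is all ones, CLZ = 0 and CTZ = 32, as expected.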
6163 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6164                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6165                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6166   assert(is_integral_type(bt), "");
6167   // xtmp = -1
6168   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6169   // xtmp = xtmp + src
6170   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6171   // xtmp = xtmp & ~src
6172   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6173   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6174   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6175   vpsub(bt, dst, xtmp4, dst, vec_enc);
6176 }
6177 
// Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
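//
// Worked example (illustrative) for a 32-bit lane: x = 8 = 0b1000
//   x | -x = 0xFFFFFFF8, POPC = 29, so CTZ = 32 - 29 = 3.
// For x == 0, x | -x = 0, POPC = 0 and CTZ = 32, as expected.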
6180 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6181                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6182   assert(is_integral_type(bt), "");
6183   // xtmp = 0
6184   vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
6185   // xtmp = 0 - src
6186   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6187   // xtmp = xtmp | src
6188   vpor(xtmp3, xtmp3, src, vec_enc);
6189   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6190   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6191   vpsub(bt, dst, xtmp1, dst, vec_enc);
6192 }
6193 
6194 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6195   Label done;
6196   Label neg_divisor_fastpath;
6197   cmpl(divisor, 0);
6198   jccb(Assembler::less, neg_divisor_fastpath);
6199   xorl(rdx, rdx);
6200   divl(divisor);
6201   jmpb(done);
6202   bind(neg_divisor_fastpath);
6203   // Fastpath for divisor < 0:
6204   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6205   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
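  // Worked example (illustrative): dividend = 0xF0000000, divisor = 0x80000001 (negative
  // as signed). dividend - divisor = 0x6FFFFFFF, dividend & ~0x6FFFFFFF = 0x90000000, and
  // shifting right by 31 yields quotient = 1, since 0xF0000000 >=u 0x80000001.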
6206   movl(rdx, rax);
6207   subl(rdx, divisor);
6208   if (VM_Version::supports_bmi1()) {
6209     andnl(rax, rdx, rax);
6210   } else {
6211     notl(rdx);
6212     andl(rax, rdx);
6213   }
6214   shrl(rax, 31);
6215   bind(done);
6216 }
6217 
6218 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6219   Label done;
6220   Label neg_divisor_fastpath;
6221   cmpl(divisor, 0);
6222   jccb(Assembler::less, neg_divisor_fastpath);
6223   xorl(rdx, rdx);
6224   divl(divisor);
6225   jmpb(done);
6226   bind(neg_divisor_fastpath);
6227   // Fastpath when divisor < 0:
6228   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6229   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
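  // Worked example (illustrative): dividend = 0xF0000000, divisor = 0x80000001. The masked
  // value 0x90000000 arithmetic-shifted by 31 gives all ones, so
  // remainder = 0xF0000000 - 0x80000001 = 0x6FFFFFFF.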
6230   movl(rdx, rax);
6231   subl(rax, divisor);
6232   if (VM_Version::supports_bmi1()) {
6233     andnl(rax, rax, rdx);
6234   } else {
6235     notl(rax);
6236     andl(rax, rdx);
6237   }
6238   sarl(rax, 31);
6239   andl(rax, divisor);
6240   subl(rdx, rax);
6241   bind(done);
6242 }
6243 
6244 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6245   Label done;
6246   Label neg_divisor_fastpath;
6247 
6248   cmpl(divisor, 0);
6249   jccb(Assembler::less, neg_divisor_fastpath);
6250   xorl(rdx, rdx);
6251   divl(divisor);
6252   jmpb(done);
6253   bind(neg_divisor_fastpath);
6254   // Fastpath for divisor < 0:
6255   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6256   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6257   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6258   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6259   movl(rdx, rax);
6260   subl(rax, divisor);
6261   if (VM_Version::supports_bmi1()) {
6262     andnl(rax, rax, rdx);
6263   } else {
6264     notl(rax);
6265     andl(rax, rdx);
6266   }
6267   movl(tmp, rax);
6268   shrl(rax, 31); // quotient
6269   sarl(tmp, 31);
6270   andl(tmp, divisor);
6271   subl(rdx, tmp); // remainder
6272   bind(done);
6273 }
6274 
6275 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6276                                  XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, using the following algorithm:
6279     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6280     mov64(rtmp, 0x8040201008040201L);
6281     movq(xtmp1, src);
6282     movq(xtmp2, rtmp);
6283     gf2p8affineqb(xtmp1, xtmp2, 0);
6284     movq(dst, xtmp1);
6285   } else {
6286     // Swap even and odd numbered bits.
6287     movl(rtmp, src);
6288     andl(rtmp, 0x55555555);
6289     shll(rtmp, 1);
6290     movl(dst, src);
6291     andl(dst, 0xAAAAAAAA);
6292     shrl(dst, 1);
6293     orl(dst, rtmp);
6294 
6295     // Swap LSB and MSB 2 bits of each nibble.
6296     movl(rtmp, dst);
6297     andl(rtmp, 0x33333333);
6298     shll(rtmp, 2);
6299     andl(dst, 0xCCCCCCCC);
6300     shrl(dst, 2);
6301     orl(dst, rtmp);
6302 
6303     // Swap LSB and MSB 4 bits of each byte.
6304     movl(rtmp, dst);
6305     andl(rtmp, 0x0F0F0F0F);
6306     shll(rtmp, 4);
6307     andl(dst, 0xF0F0F0F0);
6308     shrl(dst, 4);
6309     orl(dst, rtmp);
6310   }
6311   bswapl(dst);
6312 }
6313 
6314 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6315                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, using the following algorithm:
6318     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6319     mov64(rtmp1, 0x8040201008040201L);
6320     movq(xtmp1, src);
6321     movq(xtmp2, rtmp1);
6322     gf2p8affineqb(xtmp1, xtmp2, 0);
6323     movq(dst, xtmp1);
6324   } else {
6325     // Swap even and odd numbered bits.
6326     movq(rtmp1, src);
6327     mov64(rtmp2, 0x5555555555555555L);
6328     andq(rtmp1, rtmp2);
6329     shlq(rtmp1, 1);
6330     movq(dst, src);
6331     notq(rtmp2);
6332     andq(dst, rtmp2);
6333     shrq(dst, 1);
6334     orq(dst, rtmp1);
6335 
6336     // Swap LSB and MSB 2 bits of each nibble.
6337     movq(rtmp1, dst);
6338     mov64(rtmp2, 0x3333333333333333L);
6339     andq(rtmp1, rtmp2);
6340     shlq(rtmp1, 2);
6341     notq(rtmp2);
6342     andq(dst, rtmp2);
6343     shrq(dst, 2);
6344     orq(dst, rtmp1);
6345 
6346     // Swap LSB and MSB 4 bits of each byte.
6347     movq(rtmp1, dst);
6348     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6349     andq(rtmp1, rtmp2);
6350     shlq(rtmp1, 4);
6351     notq(rtmp2);
6352     andq(dst, rtmp2);
6353     shrq(dst, 4);
6354     orq(dst, rtmp1);
6355   }
6356   bswapq(dst);
6357 }
6358 
6359 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6360   Label done;
6361   Label neg_divisor_fastpath;
6362   cmpq(divisor, 0);
6363   jccb(Assembler::less, neg_divisor_fastpath);
6364   xorl(rdx, rdx);
6365   divq(divisor);
6366   jmpb(done);
6367   bind(neg_divisor_fastpath);
6368   // Fastpath for divisor < 0:
6369   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6370   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6371   movq(rdx, rax);
6372   subq(rdx, divisor);
6373   if (VM_Version::supports_bmi1()) {
6374     andnq(rax, rdx, rax);
6375   } else {
6376     notq(rdx);
6377     andq(rax, rdx);
6378   }
6379   shrq(rax, 63);
6380   bind(done);
6381 }
6382 
6383 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6384   Label done;
6385   Label neg_divisor_fastpath;
6386   cmpq(divisor, 0);
6387   jccb(Assembler::less, neg_divisor_fastpath);
6388   xorq(rdx, rdx);
6389   divq(divisor);
6390   jmp(done);
6391   bind(neg_divisor_fastpath);
6392   // Fastpath when divisor < 0:
6393   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6394   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6395   movq(rdx, rax);
6396   subq(rax, divisor);
6397   if (VM_Version::supports_bmi1()) {
6398     andnq(rax, rax, rdx);
6399   } else {
6400     notq(rax);
6401     andq(rax, rdx);
6402   }
6403   sarq(rax, 63);
6404   andq(rax, divisor);
6405   subq(rdx, rax);
6406   bind(done);
6407 }
6408 
6409 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6410   Label done;
6411   Label neg_divisor_fastpath;
6412   cmpq(divisor, 0);
6413   jccb(Assembler::less, neg_divisor_fastpath);
6414   xorq(rdx, rdx);
6415   divq(divisor);
6416   jmp(done);
6417   bind(neg_divisor_fastpath);
6418   // Fastpath for divisor < 0:
6419   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6420   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6421   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6422   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6423   movq(rdx, rax);
6424   subq(rax, divisor);
6425   if (VM_Version::supports_bmi1()) {
6426     andnq(rax, rax, rdx);
6427   } else {
6428     notq(rax);
6429     andq(rax, rdx);
6430   }
6431   movq(tmp, rax);
6432   shrq(rax, 63); // quotient
6433   sarq(tmp, 63);
6434   andq(tmp, divisor);
6435   subq(rdx, tmp); // remainder
6436   bind(done);
6437 }
6438 
6439 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6440                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6441                                         int vlen_enc) {
6442   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using the
  // lower 4 bits of each shuffle lane, thus all shuffle indices are
  // normalized to the index range 0-15. This makes sure that indices which
  // differ by a multiple of 16 are placed at the same relative position in a
  // 128 bit lane, i.e. source elements corresponding to shuffle indices 16, 32
  // and 48 all sit at relative position 0 of their respective 128 bit lanes.
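  // For instance, shuffle index 37 selects source element 37, i.e. relative position 5
  // (37 & 0xF) within the third 128 bit lane (lane index 37 >> 4 == 2), which is handled
  // below by the INDEX >= 32 && INDEX < 48 comparison together with the 0xAA lane broadcast.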
6449   movl(rtmp, 16);
6450   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6451 
6452   // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
6453   // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
6454   // original shuffle indices and move the shuffled lanes corresponding to true
6455   // mask to destination vector.
6456   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6457   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6458   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6459 
6460   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6461   // and broadcasting second 128 bit lane.
6462   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6463   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6464   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6465   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6466   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6467 
6468   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6469   // and broadcasting third 128 bit lane.
6470   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6471   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6472   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6473   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6474   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6475 
6476   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
6478   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6479   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6480   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6481   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6482   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6483 }
6484 
6485 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6486                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6487   if (vlen_enc == AVX_128bit) {
6488     vpermilps(dst, src, shuffle, vlen_enc);
6489   } else if (bt == T_INT) {
6490     vpermd(dst, shuffle, src, vlen_enc);
6491   } else {
6492     assert(bt == T_FLOAT, "");
6493     vpermps(dst, shuffle, src, vlen_enc);
6494   }
6495 }
6496 
6497 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6498   switch(opcode) {
6499     case Op_AddHF: vaddsh(dst, src1, src2); break;
6500     case Op_SubHF: vsubsh(dst, src1, src2); break;
6501     case Op_MulHF: vmulsh(dst, src1, src2); break;
6502     case Op_DivHF: vdivsh(dst, src1, src2); break;
6503     default: assert(false, "%s", NodeClassNames[opcode]); break;
6504   }
6505 }
6506 
6507 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6508   switch(elem_bt) {
6509     case T_BYTE:
6510       if (ideal_opc == Op_SaturatingAddV) {
6511         vpaddsb(dst, src1, src2, vlen_enc);
6512       } else {
6513         assert(ideal_opc == Op_SaturatingSubV, "");
6514         vpsubsb(dst, src1, src2, vlen_enc);
6515       }
6516       break;
6517     case T_SHORT:
6518       if (ideal_opc == Op_SaturatingAddV) {
6519         vpaddsw(dst, src1, src2, vlen_enc);
6520       } else {
6521         assert(ideal_opc == Op_SaturatingSubV, "");
6522         vpsubsw(dst, src1, src2, vlen_enc);
6523       }
6524       break;
6525     default:
6526       fatal("Unsupported type %s", type2name(elem_bt));
6527       break;
6528   }
6529 }
6530 
6531 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6532   switch(elem_bt) {
6533     case T_BYTE:
6534       if (ideal_opc == Op_SaturatingAddV) {
6535         vpaddusb(dst, src1, src2, vlen_enc);
6536       } else {
6537         assert(ideal_opc == Op_SaturatingSubV, "");
6538         vpsubusb(dst, src1, src2, vlen_enc);
6539       }
6540       break;
6541     case T_SHORT:
6542       if (ideal_opc == Op_SaturatingAddV) {
6543         vpaddusw(dst, src1, src2, vlen_enc);
6544       } else {
6545         assert(ideal_opc == Op_SaturatingSubV, "");
6546         vpsubusw(dst, src1, src2, vlen_enc);
6547       }
6548       break;
6549     default:
6550       fatal("Unsupported type %s", type2name(elem_bt));
6551       break;
6552   }
6553 }
6554 
6555 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6556                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow (underflow) happens when the second input is greater than the first input.
6558   // overflow_mask = Inp1 <u Inp2
6559   evpcmpu(elem_bt, ktmp,  src2, src1, Assembler::lt, vlen_enc);
6560   // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6561   evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6562 }
6563 
6564 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6565                                                               XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6566   // Emulate unsigned comparison using signed comparison
6567   // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
6568   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6569   vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6570   vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6571 
6572   vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6573 
6574   // Res = INP1 - INP2 (non-commutative and non-associative)
6575   vpsub(elem_bt, dst, src1, src2, vlen_enc);
6576   // Res = Mask ? Zero : Res
6577   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6578   vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6579 }
6580 
6581 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6582                                                                XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned value ranges comprise only non-negative numbers, thus only upper bound saturation exists.
6584   // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6585   // Res = Signed Add INP1, INP2
6586   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6587   // T1 = SRC1 | SRC2
6588   vpor(xtmp1, src1, src2, vlen_enc);
6589   // Max_Unsigned = -1
6590   vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6591   // Unsigned compare:  Mask = Res <u T1
6592   evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6593   // res  = Mask ? Max_Unsigned : Res
6594   evpblend(elem_bt, dst, ktmp,  dst, xtmp2, true, vlen_enc);
6595 }
6596 
6597 //
// Section 2-13 of Hacker's Delight lists the following overflow detection check for saturating
// unsigned addition operation.
//    overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
//
// We empirically determined its semantic equivalence to the following reduced expression
//    overflow_mask =  (a + b) <u (a | b)
//
// and also verified it through the Alive2 solver.
// (https://alive2.llvm.org/ce/z/XDQ7dY)
6607 //
6608 
6609 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6610                                                               XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6611   // Res = Signed Add INP1, INP2
6612   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6613   // Compute T1 = INP1 | INP2
6614   vpor(xtmp3, src1, src2, vlen_enc);
  // xtmp2 = minimum signed value (MIN_VALUE).
6616   vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6617   // Convert T1 to signed value, T1 = T1 + MIN_VALUE
6618   vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
6619   // Convert Res to signed value, Res<s> = Res + MIN_VALUE
6620   vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<s> <s T1
6622   if (elem_bt == T_INT) {
6623     vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
6624   } else {
6625     assert(elem_bt == T_LONG, "");
6626     vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
6627   }
6628   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6629 }
6630 
6631 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6632                                       int vlen_enc, bool xtmp2_hold_M1) {
6633   if (VM_Version::supports_avx512dq()) {
6634     evpmovq2m(ktmp, src, vlen_enc);
6635   } else {
6636     assert(VM_Version::supports_evex(), "");
6637     if (!xtmp2_hold_M1) {
6638       vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6639     }
6640     evpsraq(xtmp1, src, 63, vlen_enc);
6641     evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6642   }
6643 }
6644 
6645 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6646                                       int vlen_enc, bool xtmp2_hold_M1) {
6647   if (VM_Version::supports_avx512dq()) {
6648     evpmovd2m(ktmp, src, vlen_enc);
6649   } else {
6650     assert(VM_Version::supports_evex(), "");
6651     if (!xtmp2_hold_M1) {
6652       vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6653     }
6654     vpsrad(xtmp1, src, 31, vlen_enc);
6655     Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6656   }
6657 }
6658 
6659 
6660 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6661   if (elem_bt == T_LONG) {
6662     if (VM_Version::supports_evex()) {
6663       evpsraq(dst, src, 63, vlen_enc);
6664     } else {
6665       vpsrad(dst, src, 31, vlen_enc);
6666       vpshufd(dst, dst, 0xF5, vlen_enc);
6667     }
6668   } else {
6669     assert(elem_bt == T_INT, "");
6670     vpsrad(dst, src, 31, vlen_enc);
6671   }
6672 }
6673 
6674 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6675   if (compute_allones) {
6676     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6677       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6678     } else {
6679       vpcmpeqq(allones, allones, allones, vlen_enc);
6680     }
6681   }
6682   if (elem_bt == T_LONG) {
6683     vpsrlq(dst, allones, 1, vlen_enc);
6684   } else {
6685     assert(elem_bt == T_INT, "");
6686     vpsrld(dst, allones, 1, vlen_enc);
6687   }
6688 }
6689 
6690 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6691   if (compute_allones) {
6692     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6693       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6694     } else {
6695       vpcmpeqq(allones, allones, allones, vlen_enc);
6696     }
6697   }
6698   if (elem_bt == T_LONG) {
6699     vpsllq(dst, allones, 63, vlen_enc);
6700   } else {
6701     assert(elem_bt == T_INT, "");
6702     vpslld(dst, allones, 31, vlen_enc);
6703   }
6704 }
6705 
6706 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
6707                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6708   switch(elem_bt) {
6709     case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6710     case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6711     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6712     case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6713     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6714   }
6715 }
6716 
6717 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6718   switch(elem_bt) {
6719     case  T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
6720     case  T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
6721     case  T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6722     case  T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
6723     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6724   }
6725 }
6726 
6727 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6728                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6729   if (elem_bt == T_LONG) {
6730     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6731   } else {
6732     assert(elem_bt == T_INT, "");
6733     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6734   }
6735 }
6736 
6737 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6738                                                          XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6739                                                          KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6740   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection is based on Hacker's Delight section 2-13.
6743   if (ideal_opc == Op_SaturatingAddV) {
6744     // res = src1 + src2
6745     vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if the inputs have equal polarity and the result's polarity differs from it.
6747     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6748     vpxor(xtmp1, dst, src1, vlen_enc);
6749     vpxor(xtmp2, dst, src2, vlen_enc);
6750     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6751   } else {
6752     assert(ideal_opc == Op_SaturatingSubV, "");
6753     // res = src1 - src2
6754     vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when the inputs have opposite polarity and the
    // result's polarity does not match the first input's polarity.
6757     // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
6758     vpxor(xtmp1, src1, src2, vlen_enc);
6759     vpxor(xtmp2, dst, src1, vlen_enc);
6760     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6761   }
6762 
6763   // Compute overflow detection mask.
6764   evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 holds -1 in all its lanes after the above call.
6766 
6767   // Compute mask based on first input polarity.
6768   evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
6769 
6770   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6771   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6772 
  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in the first input polarity mask hold the min value.
6775   evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6776   // Blend destination lanes with saturated values using overflow detection mask.
6777   evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6778 }
6779 
6780 
6781 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6782                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6783                                                         XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
6784   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection is based on Hacker's Delight section 2-13.
6787   if (ideal_opc == Op_SaturatingAddV) {
6788     // res = src1 + src2
6789     vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if the inputs have equal polarity and the result's polarity differs from it.
6791     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6792     vpxor(xtmp1, dst, src1, vlen_enc);
6793     vpxor(xtmp2, dst, src2, vlen_enc);
6794     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6795   } else {
6796     assert(ideal_opc == Op_SaturatingSubV, "");
6797     // res = src1 - src2
6798     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6799     // Overflow occurs when the inputs have opposite signs and
6800     // the result's sign differs from the sign of the first input.
6801     // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
6802     vpxor(xtmp1, src1, src2, vlen_enc);
6803     vpxor(xtmp2, dst, src1, vlen_enc);
6804     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6805   }
6806 
6807   // Sign-extend to compute overflow detection mask.
6808   vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
6809 
6810   vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
6811   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
6812   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6813 
6814   // Compose saturating min/max vector using first input polarity mask.
6815   vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
6816   vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
6817 
6818   // Blend result with saturating vector using overflow detection mask.
6819   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6820 }
6821 
6822 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
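       // Signed saturating arithmetic: vpaddsb/vpsubsb clamp byte lanes to [-128, 127] and
       // vpaddsw/vpsubsw clamp short lanes to [-32768, 32767] instead of wrapping.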
6823   switch(elem_bt) {
6824     case T_BYTE:
6825       if (ideal_opc == Op_SaturatingAddV) {
6826         vpaddsb(dst, src1, src2, vlen_enc);
6827       } else {
6828         assert(ideal_opc == Op_SaturatingSubV, "");
6829         vpsubsb(dst, src1, src2, vlen_enc);
6830       }
6831       break;
6832     case T_SHORT:
6833       if (ideal_opc == Op_SaturatingAddV) {
6834         vpaddsw(dst, src1, src2, vlen_enc);
6835       } else {
6836         assert(ideal_opc == Op_SaturatingSubV, "");
6837         vpsubsw(dst, src1, src2, vlen_enc);
6838       }
6839       break;
6840     default:
6841       fatal("Unsupported type %s", type2name(elem_bt));
6842       break;
6843   }
6844 }
6845 
6846 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
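       // Unsigned saturating arithmetic: vpaddusb/vpsubusb clamp byte lanes to [0, 255] and
       // vpaddusw/vpsubusw clamp short lanes to [0, 65535] instead of wrapping.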
6847   switch(elem_bt) {
6848     case T_BYTE:
6849       if (ideal_opc == Op_SaturatingAddV) {
6850         vpaddusb(dst, src1, src2, vlen_enc);
6851       } else {
6852         assert(ideal_opc == Op_SaturatingSubV, "");
6853         vpsubusb(dst, src1, src2, vlen_enc);
6854       }
6855       break;
6856     case T_SHORT:
6857       if (ideal_opc == Op_SaturatingAddV) {
6858         vpaddusw(dst, src1, src2, vlen_enc);
6859       } else {
6860         assert(ideal_opc == Op_SaturatingSubV, "");
6861         vpsubusw(dst, src1, src2, vlen_enc);
6862       }
6863       break;
6864     default:
6865       fatal("Unsupported type %s", type2name(elem_bt));
6866       break;
6867   }
6868 }
6869 
6870 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6871                                                      XMMRegister src2, int vlen_enc) {
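       // On entry dst holds the per-lane selection indices. vpermi2* treats src1:src2 as a
       // single 2*N-entry table (indices 0..N-1 pick from src1, N..2N-1 from src2) and
       // overwrites dst with the selected elements.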
6872   switch(elem_bt) {
6873     case T_BYTE:
6874       evpermi2b(dst, src1, src2, vlen_enc);
6875       break;
6876     case T_SHORT:
6877       evpermi2w(dst, src1, src2, vlen_enc);
6878       break;
6879     case T_INT:
6880       evpermi2d(dst, src1, src2, vlen_enc);
6881       break;
6882     case T_LONG:
6883       evpermi2q(dst, src1, src2, vlen_enc);
6884       break;
6885     case T_FLOAT:
6886       evpermi2ps(dst, src1, src2, vlen_enc);
6887       break;
6888     case T_DOUBLE:
6889       evpermi2pd(dst, src1, src2, vlen_enc);
6890       break;
6891     default:
6892       fatal("Unsupported type %s", type2name(elem_bt));
6893       break;
6894   }
6895 }
6896 
6897 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
6898   if (is_unsigned) {
6899     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6900   } else {
6901     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6902   }
6903 }
6904 
6905 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
6906   if (is_unsigned) {
6907     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6908   } else {
6909     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6910   }
6911 }
6912 
6913 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
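       // Dispatch half-precision (FP16) vector arithmetic to the corresponding AVX512-FP16 instruction.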
6914   switch(opcode) {
6915     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
6916     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
6917     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
6918     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
6919     default: assert(false, "%s", NodeClassNames[opcode]); break;
6920   }
6921 }
6922 
6923 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6924   switch(opcode) {
6925     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
6926     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
6927     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
6928     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
6929     default: assert(false, "%s", NodeClassNames[opcode]); break;
6930   }
6931 }
6932 
6933 void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6934                                             KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
6935   vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
6936 }
6937 
6938 void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6939                                             KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
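       // Scalar semantics implemented here (illustration only; follows the Java
       // Float.max/min-style rules expected for half-float lanes):
       //   max(a, b): NaN if either input is NaN; +0.0 is greater than -0.0;
       //              otherwise the numerically larger value.
       //   min(a, b): NaN if either input is NaN; -0.0 is smaller than +0.0;
       //              otherwise the numerically smaller value.
       // vmaxph/vminph alone do not give this directly (they return the second operand for
       // equal-zero and single-NaN inputs), hence the operand swap and NaN fix-up below.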
6940   if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
6941     // Move sign bits of src2 to mask register.
6942     evpmovw2m(ktmp, src2, vlen_enc);
6943     // xtmp1 = src2 < 0 ? src2 : src1
6944     evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
6945     // xtmp2 = src2 < 0 ? src1 : src2
6946     evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
6947     // The idea behind the above swapping is to make the second source operand a positive value.
6948     // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
6949     // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
6950     // the second source operand, either a NaN or a valid floating-point value, is returned.
6951     // dst = max(xtmp1, xtmp2)
6952     evmaxph(dst, xtmp1, xtmp2, vlen_enc);
6953     // isNaN = is_unordered_quiet(xtmp1)
6954     evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
6955     // The final result is the same as the first source if it is a NaN;
6956     // if the second operand holds a NaN then, as per the above semantics,
6957     // the result is the same as the second operand.
6958     Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
6959   } else {
6960     assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
6961     // Move sign bits of src1 to mask register.
6962     evpmovw2m(ktmp, src1, vlen_enc);
6963     // xtmp1 = src1 < 0 ? src2 : src1
6964     evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
6965     // xtmp2 = src1 < 0 ? src1 : src2
6966     evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
6967     // The idea behind the above swapping is to make the second source operand a negative value.
6968     // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
6969     // the second source operand is returned.
6970     // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
6971     // or a valid floating-point value, is written to the result.
6972     // dst = min(xtmp1, xtmp2)
6973     evminph(dst, xtmp1, xtmp2, vlen_enc);
6974     // isNaN = is_unordered_quiet(xtmp1)
6975     evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
6976     // The final result is the same as the first source if it is a NaN;
6977     // if the second operand holds a NaN then, as per the above semantics,
6978     // the result is the same as the second operand.
6979     Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
6980   }
6981 }