1 /*
   2  * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "gc/shared/barrierSet.hpp"
  28 #include "gc/shared/barrierSetAssembler.hpp"
  29 #include "oops/methodData.hpp"
  30 #include "opto/c2_MacroAssembler.hpp"
  31 #include "opto/intrinsicnode.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/opcodes.hpp"
  34 #include "opto/subnode.hpp"
  35 #include "runtime/globals.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 #include "utilities/checkedCast.hpp"
  39 #include "utilities/globalDefinitions.hpp"
  40 #include "utilities/powerOfTwo.hpp"
  41 #include "utilities/sizes.hpp"
  42 
  43 #ifdef PRODUCT
  44 #define BLOCK_COMMENT(str) /* nothing */
  45 #define STOP(error) stop(error)
  46 #else
  47 #define BLOCK_COMMENT(str) block_comment(str)
  48 #define STOP(error) block_comment(error); stop(error)
  49 #endif
  50 
  51 // C2 compiled method's prolog code.
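     //
     // For orientation only, a rough sketch of the frame this prolog lays out
     // (not authoritative; the exact shape depends on framesize, stack banging
     // and VerifyStackAtCalls):
     //
     //     [ return address ]   <- pushed by the caller's call
     //     [ saved rbp      ]   <- rbp points here when PreserveFramePointer
     //     [ spills/locals  ]   framesize - 2*wordSize bytes
     //     [ ...            ]   <- rsp after the prolog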
  52 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  53   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  54 
  55   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  56   // Remove word for return addr
  57   framesize -= wordSize;
  58   stack_bang_size -= wordSize;
  59 
  60   // Calls to C2R adapters often do not accept exceptional returns.
  61   // We require that their callers bang for them.  But be careful, because
  62   // some VM calls (such as call site linkage) can use several kilobytes of
  63   // stack.  The stack safety zone should account for that, however.
  64   // See bugs 4446381, 4468289, 4497237.
  65   if (stack_bang_size > 0) {
  66     generate_stack_overflow_check(stack_bang_size);
  67 
  68     // We always push rbp, so that on return to the interpreter rbp will be
  69     // restored correctly and we can correct the stack.
  70     push(rbp);
  71     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  72     if (PreserveFramePointer) {
  73       mov(rbp, rsp);
  74     }
  75     // Remove word for rbp
  76     framesize -= wordSize;
  77 
  78     // Create frame
  79     if (framesize) {
  80       subptr(rsp, framesize);
  81     }
  82   } else {
  83     subptr(rsp, framesize);
  84 
  85     // Save RBP register now.
  86     framesize -= wordSize;
  87     movptr(Address(rsp, framesize), rbp);
  88     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  89     if (PreserveFramePointer) {
  90       movptr(rbp, rsp);
  91       if (framesize > 0) {
  92         addptr(rbp, framesize);
  93       }
  94     }
  95   }
  96 
  97   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
  98     framesize -= wordSize;
  99     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 100   }
 101 
 102 #ifdef ASSERT
 103   if (VerifyStackAtCalls) {
 104     Label L;
 105     push(rax);
 106     mov(rax, rsp);
 107     andptr(rax, StackAlignmentInBytes-1);
 108     cmpptr(rax, StackAlignmentInBytes-wordSize);
 109     pop(rax);
 110     jcc(Assembler::equal, L);
 111     STOP("Stack is not properly aligned!");
 112     bind(L);
 113   }
 114 #endif
 115 
 116   if (!is_stub) {
 117     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 118     // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 119     Label dummy_slow_path;
 120     Label dummy_continuation;
 121     Label* slow_path = &dummy_slow_path;
 122     Label* continuation = &dummy_continuation;
 123     if (!Compile::current()->output()->in_scratch_emit_size()) {
 124       // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 125       C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 126       Compile::current()->output()->add_stub(stub);
 127       slow_path = &stub->entry();
 128       continuation = &stub->continuation();
 129     }
 130     bs->nmethod_entry_barrier(this, slow_path, continuation);
 131   }
 132 }
 133 
 134 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 135   switch (vlen_in_bytes) {
 136     case  4: // fall-through
 137     case  8: // fall-through
 138     case 16: return Assembler::AVX_128bit;
 139     case 32: return Assembler::AVX_256bit;
 140     case 64: return Assembler::AVX_512bit;
 141 
 142     default: {
 143       ShouldNotReachHere();
 144       return Assembler::AVX_NoVec;
 145     }
 146   }
 147 }
 148 
 149 // fast_lock and fast_unlock used by C2
 150 
 151 // Because the transitions from emitted code to the runtime
 152 // monitorenter/exit helper stubs are so slow, it's critical that
 153 // we inline both the stack-locking fast path and the inflated fast path.
 154 //
 155 // See also: cmpFastLock and cmpFastUnlock.
 156 //
 157 // What follows is a specialized inline transliteration of the code
 158 // in enter() and exit(). If we're concerned about I$ bloat, another
 159 // option would be to emit TrySlowEnter and TrySlowExit methods
 160 // at startup time.  These methods would accept arguments as
 161 // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 162 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 163 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 164 // In practice, however, the # of lock sites is bounded and is usually small.
 165 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
 166 // if the processor uses simple bimodal branch predictors keyed by EIP,
 167 // since the helper routines would be called from multiple synchronization
 168 // sites.
 169 //
 170 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 171 // in Java - using j.u.c and unsafe - and just bind the lock and unlock sites
 172 // to those specialized methods.  That'd give us a mostly platform-independent
 173 // implementation that the JITs could optimize and inline at their pleasure.
 174 // Done correctly, the only time we'd need to cross to native code would be
 175 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 176 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 177 // (b) issue explicit barriers or fence operations.
 178 //
 179 // TODO:
 180 //
 181 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 182 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 183 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 184 //    the lock operators would typically be faster than reifying Self.
 185 //
 186 // *  Ideally I'd define the primitives as:
 187 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 188 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 189 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 190 //    Instead, we're stuck with the rather awkward and brittle register assignments below.
 191 //    Furthermore, the register assignments are overconstrained, possibly resulting in
 192 //    sub-optimal code near the synchronization site.
 193 //
 194 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 195 //    Alternately, use a better sp-proximity test.
 196 //
 197 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 198 //    Either one is sufficient to uniquely identify a thread.
 199 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 200 //
 201 // *  Intrinsify notify() and notifyAll() for the common cases where the
 202 //    object is locked by the calling thread but the waitlist is empty.
 203 //    This would avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 204 //
 205 // *  Use jccb and jmpb instead of jcc and jmp to improve code density.
 206 //    But beware of excessive branch density on AMD Opterons.
 207 //
 208 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 209 //    or failure of the fast path.  If the fast path fails then we pass
 210 //    control to the slow path, typically in C.  In fast_lock and
 211 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 212 //    will emit a conditional branch immediately after the node.
 213 //    So we have branches to branches and lots of ICC.ZF games.
 214 //    Instead, it might be better to have C2 pass a "FailureLabel"
 215 //    into fast_lock and fast_unlock.  In the case of success, control
 216 //    will drop through the node.  ICC.ZF is undefined at exit.
 217 //    In the case of failure, the node will branch directly to the
 218 //    FailureLabel.
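     //
     //    For orientation only, a rough pseudo-code sketch of the fast_lock path
     //    emitted below (descriptive names, not the actual register assignment):
     //
     //      mark = obj->mark
     //      if (mark has the monitor bit set)         goto inflated
     //      if (lock-stack is full)                   goto slow_path            // ZF == 0
     //      if (lock-stack top element == obj)        push obj; goto locked     // recursion
     //      if (CAS(obj->mark: unlocked -> locked))   push obj; goto locked
     //      else                                      goto slow_path            // ZF == 0
     //
     //    inflated:
     //      monitor = from the mark word, or from the per-thread OMCache (UseObjectMonitorTable)
     //      if (CAS(monitor->owner: null -> self))    goto locked
     //      if (owner == self)                        monitor->recursions++; goto locked
     //      else                                      goto slow_path            // ZF == 0
     //
     //    locked: ZF is forced to 1 (xor rax,rax) so the cmpFastLock user sees success.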
 219 
 220 
 221 // obj: object to lock
 222 // box: on-stack box address -- KILLED
 223 // rax: tmp -- KILLED
 224 // t  : tmp -- KILLED
 225 void C2_MacroAssembler::fast_lock(Register obj, Register box, Register rax_reg,
 226                                   Register t, Register thread) {
 227   assert(rax_reg == rax, "Used for CAS");
 228   assert_different_registers(obj, box, rax_reg, t, thread);
 229 
 230   // Handle inflated monitor.
 231   Label inflated;
 232   // Finish fast lock successfully. ZF value is irrelevant.
 233   Label locked;
 234   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 235   Label slow_path;
 236 
 237   if (UseObjectMonitorTable) {
 238     // Clear cache in case fast locking succeeds or we need to take the slow-path.
 239     movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
 240   }
 241 
 242   if (DiagnoseSyncOnValueBasedClasses != 0) {
 243     load_klass(rax_reg, obj, t);
 244     testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 245     jcc(Assembler::notZero, slow_path);
 246   }
 247 
 248   const Register mark = t;
 249 
 250   { // Fast Lock
 251 
 252     Label push;
 253 
 254     const Register top = UseObjectMonitorTable ? rax_reg : box;
 255 
 256     // Load the mark.
 257     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 258 
 259     // Prefetch top.
 260     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 261 
 262     // Check for monitor (0b10).
 263     testptr(mark, markWord::monitor_value);
 264     jcc(Assembler::notZero, inflated);
 265 
 266     // Check if lock-stack is full.
 267     cmpl(top, LockStack::end_offset() - 1);
 268     jcc(Assembler::greater, slow_path);
 269 
 270     // Check if recursive.
 271     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 272     jccb(Assembler::equal, push);
 273 
 274     // Try to lock. Transition lock bits 0b01 => 0b00
 275     movptr(rax_reg, mark);
 276     orptr(rax_reg, markWord::unlocked_value);
 277     andptr(mark, ~(int32_t)markWord::unlocked_value);
 278     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 279     jcc(Assembler::notEqual, slow_path);
 280 
 281     if (UseObjectMonitorTable) {
 282       // Need to reload top, clobbered by CAS.
 283       movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 284     }
 285     bind(push);
 286     // After successful lock, push object on lock-stack.
 287     movptr(Address(thread, top), obj);
 288     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 289     jmpb(locked);
 290   }
 291 
 292   { // Handle inflated monitor.
 293     bind(inflated);
 294 
 295     const Register monitor = t;
 296 
 297     if (!UseObjectMonitorTable) {
 298       assert(mark == monitor, "should be the same here");
 299     } else {
 300       // Uses ObjectMonitorTable.  Look for the monitor in the om_cache.
 301       // Fetch ObjectMonitor* from the cache or take the slow-path.
 302       Label monitor_found;
 303 
 304       // Load cache address
 305       lea(t, Address(thread, JavaThread::om_cache_oops_offset()));
 306 
 307       const int num_unrolled = 2;
 308       for (int i = 0; i < num_unrolled; i++) {
 309         cmpptr(obj, Address(t));
 310         jccb(Assembler::equal, monitor_found);
 311         increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 312       }
 313 
 314       Label loop;
 315 
 316       // Search for obj in cache.
 317       bind(loop);
 318 
 319       // Check for match.
 320       cmpptr(obj, Address(t));
 321       jccb(Assembler::equal, monitor_found);
 322 
 323       // Search until null encountered, guaranteed _null_sentinel at end.
 324       cmpptr(Address(t), 1);
 325       jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
 326       increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 327       jmpb(loop);
 328 
 329       // Cache hit.
 330       bind(monitor_found);
 331       movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
 332     }
 333     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 334     const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 335     const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);
 336 
 337     Label monitor_locked;
 338     // Lock the monitor.
 339 
 340     if (UseObjectMonitorTable) {
 341       // Cache the monitor for unlock before trashing box. On failure to acquire
 342       // the lock, the slow path will reset the entry accordingly (see CacheSetter).
 343       movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
 344     }
 345 
 346     // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 347     xorptr(rax_reg, rax_reg);
 348     movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
 349     lock(); cmpxchgptr(box, owner_address);
 350     jccb(Assembler::equal, monitor_locked);
 351 
 352     // Check if recursive.
 353     cmpptr(box, rax_reg);
 354     jccb(Assembler::notEqual, slow_path);
 355 
 356     // Recursive.
 357     increment(recursions_address);
 358 
 359     bind(monitor_locked);
 360   }
 361 
 362   bind(locked);
 363   // Set ZF = 1
 364   xorl(rax_reg, rax_reg);
 365 
 366 #ifdef ASSERT
 367   // Check that locked label is reached with ZF set.
 368   Label zf_correct;
 369   Label zf_bad_zero;
 370   jcc(Assembler::zero, zf_correct);
 371   jmp(zf_bad_zero);
 372 #endif
 373 
 374   bind(slow_path);
 375 #ifdef ASSERT
 376   // Check that slow_path label is reached with ZF not set.
 377   jcc(Assembler::notZero, zf_correct);
 378   stop("Fast Lock ZF != 0");
 379   bind(zf_bad_zero);
 380   stop("Fast Lock ZF != 1");
 381   bind(zf_correct);
 382 #endif
 383   // C2 uses the value of ZF to determine the continuation.
 384 }
 385 
 386 // obj: object to lock
 387 // rax: tmp -- KILLED
 388 // t  : tmp - cannot be obj nor rax -- KILLED
 389 //
 390 // Some commentary on balanced locking:
 391 //
 392 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 393 // Methods that don't have provably balanced locking are forced to run in the
 394 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 395 // The interpreter provides two properties:
 396 // I1:  At return-time the interpreter automatically and quietly unlocks any
 397 //      objects acquired in the current activation (frame).  Recall that the
 398 //      interpreter maintains an on-stack list of locks currently held by
 399 //      a frame.
 400 // I2:  If a method attempts to unlock an object that is not held by the
 401 //      frame, the interpreter throws IMSX (IllegalMonitorStateException).
 402 //
 403 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
 404 // B() doesn't have provably balanced locking so it runs in the interpreter.
 405 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 406 // is still locked by A().
 407 //
 408 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface
 409 // Specification" states that an object locked by JNI's MonitorEnter should not be
 410 // unlocked by "normal" java-level locking and vice-versa.  The specification doesn't
 411 // specify what will occur if a program engages in such mixed-mode locking, however.
 412 // Arguably, given that the spec legislates the JNI case as undefined, our implementation
 413 // could reasonably *avoid* checking the owner in fast_unlock().
 414 // In the interest of performance we elide the m->Owner==Self check in unlock.
 415 // A perfectly viable alternative is to elide the owner check except when
 416 // Xcheck:jni is enabled.
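     //
     // For orientation only, a rough pseudo-code sketch of the fast_unlock path
     // emitted below:
     //
     //   if (lock-stack top element != obj)          goto inflated
     //   pop the lock-stack
     //   if (new top element == obj)                 goto unlocked             // recursion
     //   if (CAS(obj->mark: locked -> unlocked))     goto unlocked
     //   else                                        restore the lock-stack entry (stub) and
     //                                               take the slow path        // ZF == 0
     //
     // inflated:
     //   monitor = from the mark word, or from the BasicLock cache (UseObjectMonitorTable)
     //   if (monitor->recursions != 0)               monitor->recursions--; goto unlocked
     //   monitor->owner = null; StoreLoad fence
     //   if (entry_list is empty or a successor exists)   goto unlocked
     //   else    record the monitor in the thread and take the slow path      // ZF == 0
     //
     // unlocked: ZF is forced to 1 so the cmpFastUnlock user sees success.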
 417 
 418 void C2_MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register t, Register thread) {
 419   assert(reg_rax == rax, "Used for CAS");
 420   assert_different_registers(obj, reg_rax, t);
 421 
 422   // Handle inflated monitor.
 423   Label inflated, inflated_check_lock_stack;
 424   // Finish fast unlock successfully.  MUST jump with ZF == 1
 425   Label unlocked, slow_path;
 426 
 427   const Register mark = t;
 428   const Register monitor = t;
 429   const Register top = UseObjectMonitorTable ? t : reg_rax;
 430   const Register box = reg_rax;
 431 
 432   Label dummy;
 433   C2FastUnlockStub* stub = nullptr;
 434 
 435   if (!Compile::current()->output()->in_scratch_emit_size()) {
 436     stub = new (Compile::current()->comp_arena()) C2FastUnlockStub(obj, mark, reg_rax, thread);
 437     Compile::current()->output()->add_stub(stub);
 438   }
 439 
 440   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
 441 
 442   { // Fast Unlock
 443 
 444     // Load top.
 445     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 446 
 447     if (!UseObjectMonitorTable) {
 448       // Prefetch mark.
 449       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 450     }
 451 
 452     // Check if obj is top of lock-stack.
 453     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 454     // Top of lock stack was not obj. Must be monitor.
 455     jcc(Assembler::notEqual, inflated_check_lock_stack);
 456 
 457     // Pop lock-stack.
 458     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
 459     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 460 
 461     // Check if recursive.
 462     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
 463     jcc(Assembler::equal, unlocked);
 464 
 465     // We elide the monitor check, let the CAS fail instead.
 466 
 467     if (UseObjectMonitorTable) {
 468       // Load mark.
 469       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 470     }
 471 
 472     // Try to unlock. Transition lock bits 0b00 => 0b01
 473     movptr(reg_rax, mark);
 474     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
 475     orptr(mark, markWord::unlocked_value);
 476     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 477     jcc(Assembler::notEqual, push_and_slow_path);
 478     jmp(unlocked);
 479   }
 480 
 481 
 482   { // Handle inflated monitor.
 483     bind(inflated_check_lock_stack);
 484 #ifdef ASSERT
 485     Label check_done;
 486     subl(top, oopSize);
 487     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
 488     jcc(Assembler::below, check_done);
 489     cmpptr(obj, Address(thread, top));
 490     jccb(Assembler::notEqual, inflated_check_lock_stack);
 491     stop("Fast Unlock lock on stack");
 492     bind(check_done);
 493     if (UseObjectMonitorTable) {
 494       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 495     }
 496     testptr(mark, markWord::monitor_value);
 497     jccb(Assembler::notZero, inflated);
 498     stop("Fast Unlock not monitor");
 499 #endif
 500 
 501     bind(inflated);
 502 
 503     if (!UseObjectMonitorTable) {
 504       assert(mark == monitor, "should be the same here");
 505     } else {
 506       // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
 507       movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 508       // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
 509       cmpptr(monitor, alignof(ObjectMonitor*));
 510       jcc(Assembler::below, slow_path);
 511     }
 512     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 513     const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
 514     const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
 515     const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
 516     const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
 517 
 518     Label recursive;
 519 
 520     // Check if recursive.
 521     cmpptr(recursions_address, 0);
 522     jccb(Assembler::notZero, recursive);
 523 
 524     // Set owner to null.
 525     // Release to satisfy the JMM
 526     movptr(owner_address, NULL_WORD);
 527     // We need a full fence after clearing owner to avoid stranding.
 528     // StoreLoad achieves this.
 529     membar(StoreLoad);
 530 
 531     // Check if the entry_list is empty.
 532     cmpptr(entry_list_address, NULL_WORD);
 533     jccb(Assembler::zero, unlocked);    // If so we are done.
 534 
 535     // Check if there is a successor.
 536     cmpptr(succ_address, NULL_WORD);
 537     jccb(Assembler::notZero, unlocked); // If so we are done.
 538 
 539     // Save the monitor pointer in the current thread, so we can try to
 540     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 541     if (!UseObjectMonitorTable) {
 542       andptr(monitor, ~(int32_t)markWord::monitor_value);
 543     }
 544     movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);
 545 
 546     orl(t, 1); // Fast Unlock ZF = 0
 547     jmpb(slow_path);
 548 
 549     // Recursive unlock.
 550     bind(recursive);
 551     decrement(recursions_address);
 552   }
 553 
 554   bind(unlocked);
 555   xorl(t, t); // Fast Unlock ZF = 1
 556 
 557 #ifdef ASSERT
 558   // Check that unlocked label is reached with ZF set.
 559   Label zf_correct;
 560   Label zf_bad_zero;
 561   jcc(Assembler::zero, zf_correct);
 562   jmp(zf_bad_zero);
 563 #endif
 564 
 565   bind(slow_path);
 566   if (stub != nullptr) {
 567     bind(stub->slow_path_continuation());
 568   }
 569 #ifdef ASSERT
 570   // Check that stub->continuation() label is reached with ZF not set.
 571   jcc(Assembler::notZero, zf_correct);
 572   stop("Fast Unlock ZF != 0");
 573   bind(zf_bad_zero);
 574   stop("Fast Unlock ZF != 1");
 575   bind(zf_correct);
 576 #endif
 577   // C2 uses the value of ZF to determine the continuation.
 578 }
 579 
 580 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
 581   fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
 582 }
 583 
 584 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
 585   const int framesize = Compile::current()->output()->frame_size_in_bytes();
 586   masm->movptr(dst, rsp);
 587   if (framesize > 2 * wordSize) {
 588     masm->addptr(dst, framesize - 2 * wordSize);
 589   }
 590 }
 591 
 592 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
 593   if (PreserveFramePointer) {
 594     // frame pointer is valid
 595 #ifdef ASSERT
 596     // Verify frame pointer value in rbp.
 597     reconstruct_frame_pointer_helper(this, rtmp);
 598     Label L_success;
 599     cmpq(rbp, rtmp);
 600     jccb(Assembler::equal, L_success);
 601     STOP("frame pointer mismatch");
 602     bind(L_success);
 603 #endif // ASSERT
 604   } else {
 605     reconstruct_frame_pointer_helper(this, rbp);
 606   }
 607 }
 608 
 609 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
 610   jint lo = t->_lo;
 611   jint hi = t->_hi;
 612   assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
 613   if (t == TypeInt::INT) {
 614     return;
 615   }
 616 
 617   BLOCK_COMMENT("CastII {");
 618   Label fail;
 619   Label succeed;
 620 
 621   if (lo != min_jint) {
 622     cmpl(val, lo);
 623     jccb(Assembler::less, fail);
 624   }
 625   if (hi != max_jint) {
 626     cmpl(val, hi);
 627     jccb(Assembler::greater, fail);
 628   }
 629   jmpb(succeed);
 630 
 631   bind(fail);
 632   movl(c_rarg0, idx);
 633   movl(c_rarg1, val);
 634   movl(c_rarg2, lo);
 635   movl(c_rarg3, hi);
 636   reconstruct_frame_pointer(rscratch1);
 637   call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
 638   hlt();
 639   bind(succeed);
 640   BLOCK_COMMENT("} // CastII");
 641 }
 642 
 643 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
 644   fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
 645 }
 646 
 647 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
 648   jlong lo = t->_lo;
 649   jlong hi = t->_hi;
 650   assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
 651   if (t == TypeLong::LONG) {
 652     return;
 653   }
 654 
 655   BLOCK_COMMENT("CastLL {");
 656   Label fail;
 657   Label succeed;
 658 
 659   auto cmp_val = [&](jlong bound) {
 660     if (is_simm32(bound)) {
 661       cmpq(val, checked_cast<int>(bound));
 662     } else {
 663       mov64(tmp, bound);
 664       cmpq(val, tmp);
 665     }
 666   };
 667 
 668   if (lo != min_jlong) {
 669     cmp_val(lo);
 670     jccb(Assembler::less, fail);
 671   }
 672   if (hi != max_jlong) {
 673     cmp_val(hi);
 674     jccb(Assembler::greater, fail);
 675   }
 676   jmpb(succeed);
 677 
 678   bind(fail);
 679   movl(c_rarg0, idx);
 680   movq(c_rarg1, val);
 681   mov64(c_rarg2, lo);
 682   mov64(c_rarg3, hi);
 683   reconstruct_frame_pointer(rscratch1);
 684   call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
 685   hlt();
 686   bind(succeed);
 687   BLOCK_COMMENT("} // CastLL");
 688 }
 689 
 690 //-------------------------------------------------------------------------------------------
 691 // Generic instructions support for use in .ad files C2 code generation
 692 
 693 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 694   if (dst != src) {
 695     movdqu(dst, src);
 696   }
 697   if (opcode == Op_AbsVD) {
 698     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 699   } else {
 700     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 701     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 702   }
 703 }
 704 
 705 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 706   if (opcode == Op_AbsVD) {
 707     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 708   } else {
 709     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 710     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 711   }
 712 }
 713 
 714 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 715   if (dst != src) {
 716     movdqu(dst, src);
 717   }
 718   if (opcode == Op_AbsVF) {
 719     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 720   } else {
 721     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 722     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 723   }
 724 }
 725 
 726 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 727   if (opcode == Op_AbsVF) {
 728     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 729   } else {
 730     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 731     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 732   }
 733 }
 734 
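     // Note on the T_LONG paths below (a sketch, not the emitted code): there is no
     // packed 64-bit signed min/max instruction before AVX-512, so the SSE path
     // synthesizes it with pcmpgtq plus the legacy blendvpd, which reads its blend
     // mask implicitly from xmm0 - hence the assert(tmp == xmm0).  For MinV, roughly:
     //
     //   xmm0 = (dst > src)          // pcmpgtq, per 64-bit lane
     //   dst  = xmm0 ? src : dst     // blendvpd keeps the smaller lane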
 735 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 736   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 737   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 738 
 739   if (opcode == Op_MinV) {
 740     if (elem_bt == T_BYTE) {
 741       pminsb(dst, src);
 742     } else if (elem_bt == T_SHORT) {
 743       pminsw(dst, src);
 744     } else if (elem_bt == T_INT) {
 745       pminsd(dst, src);
 746     } else {
 747       assert(elem_bt == T_LONG, "required");
 748       assert(tmp == xmm0, "required");
 749       assert_different_registers(dst, src, tmp);
 750       movdqu(xmm0, dst);
 751       pcmpgtq(xmm0, src);
 752       blendvpd(dst, src);  // xmm0 as mask
 753     }
 754   } else { // opcode == Op_MaxV
 755     if (elem_bt == T_BYTE) {
 756       pmaxsb(dst, src);
 757     } else if (elem_bt == T_SHORT) {
 758       pmaxsw(dst, src);
 759     } else if (elem_bt == T_INT) {
 760       pmaxsd(dst, src);
 761     } else {
 762       assert(elem_bt == T_LONG, "required");
 763       assert(tmp == xmm0, "required");
 764       assert_different_registers(dst, src, tmp);
 765       movdqu(xmm0, src);
 766       pcmpgtq(xmm0, dst);
 767       blendvpd(dst, src);  // xmm0 as mask
 768     }
 769   }
 770 }
 771 
 772 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 773                                   XMMRegister src1, Address src2, int vlen_enc) {
 774   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 775   if (opcode == Op_UMinV) {
 776     switch(elem_bt) {
 777       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 778       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 779       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 780       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 781       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 782     }
 783   } else {
 784     assert(opcode == Op_UMaxV, "required");
 785     switch(elem_bt) {
 786       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 787       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 788       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 789       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 790       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 791     }
 792   }
 793 }
 794 
 795 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
 796   // For optimality, leverage a full vector width of 512 bits
 797   // for operations over smaller vector sizes on AVX512 targets.
 798   if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 799     if (opcode == Op_UMaxV) {
 800       evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 801     } else {
 802       assert(opcode == Op_UMinV, "required");
 803       evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 804     }
 805   } else {
 806     // T1 = -1
 807     vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
 808     // T1 = -1 << 63
 809     vpsllq(xtmp1, xtmp1, 63, vlen_enc);
 810     // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
 811     vpaddq(xtmp2, xtmp1, src2, vlen_enc);
 812     // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
 813     vpaddq(xtmp1, xtmp1, src1, vlen_enc);
 814     // Mask = T2 > T1
 815     vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
 816     if (opcode == Op_UMaxV) {
 817       // Res = Mask ? Src2 : Src1
 818       vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
 819     } else {
 820       // Res = Mask ? Src1 : Src2
 821       vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
 822     }
 823   }
 824 }
 825 
 826 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 827                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
 828   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 829   if (opcode == Op_UMinV) {
 830     switch(elem_bt) {
 831       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 832       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 833       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 834       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 835       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 836     }
 837   } else {
 838     assert(opcode == Op_UMaxV, "required");
 839     switch(elem_bt) {
 840       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 841       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 842       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 843       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 844       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 845     }
 846   }
 847 }
 848 
 849 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 850                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 851                                  int vlen_enc) {
 852   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 853 
 854   if (opcode == Op_MinV) {
 855     if (elem_bt == T_BYTE) {
 856       vpminsb(dst, src1, src2, vlen_enc);
 857     } else if (elem_bt == T_SHORT) {
 858       vpminsw(dst, src1, src2, vlen_enc);
 859     } else if (elem_bt == T_INT) {
 860       vpminsd(dst, src1, src2, vlen_enc);
 861     } else {
 862       assert(elem_bt == T_LONG, "required");
 863       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 864         vpminsq(dst, src1, src2, vlen_enc);
 865       } else {
 866         assert_different_registers(dst, src1, src2);
 867         vpcmpgtq(dst, src1, src2, vlen_enc);
 868         vblendvpd(dst, src1, src2, dst, vlen_enc);
 869       }
 870     }
 871   } else { // opcode == Op_MaxV
 872     if (elem_bt == T_BYTE) {
 873       vpmaxsb(dst, src1, src2, vlen_enc);
 874     } else if (elem_bt == T_SHORT) {
 875       vpmaxsw(dst, src1, src2, vlen_enc);
 876     } else if (elem_bt == T_INT) {
 877       vpmaxsd(dst, src1, src2, vlen_enc);
 878     } else {
 879       assert(elem_bt == T_LONG, "required");
 880       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 881         vpmaxsq(dst, src1, src2, vlen_enc);
 882       } else {
 883         assert_different_registers(dst, src1, src2);
 884         vpcmpgtq(dst, src1, src2, vlen_enc);
 885         vblendvpd(dst, src2, src1, dst, vlen_enc);
 886       }
 887     }
 888   }
 889 }
 890 
 891 // Float/Double min max
 892 
 893 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
 894                                    XMMRegister dst, XMMRegister a, XMMRegister b,
 895                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
 896                                    int vlen_enc) {
 897   assert(UseAVX > 0, "required");
 898   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
 899          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
 900   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
 901   assert_different_registers(a, tmp, atmp, btmp);
 902   assert_different_registers(b, tmp, atmp, btmp);
 903 
 904   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
 905   bool is_double_word = is_double_word_type(elem_bt);
 906 
 907   /* Note on 'non-obvious' assembly sequence:
 908    *
 909    * While there are vminps/vmaxps instructions, there are two important differences between hardware
 910    * and Java in how they handle floats:
 911    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
 912    *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
 913    *
 914    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
 915    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
 916    *                (only useful when signs differ, noop otherwise)
 917    *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
 918    *
 919    *  Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
 920    *   btmp = (b < +0.0) ? a : b
 921    *   atmp = (b < +0.0) ? b : a
 922    *   Tmp  = Max_Float(atmp , btmp)
 923    *   Res  = (atmp == NaN) ? atmp : Tmp
 924    */
 925 
 926   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
 927   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
 928   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
 929   XMMRegister mask;
 930 
 931   if (!is_double_word && is_min) {
 932     mask = a;
 933     vblend = &MacroAssembler::vblendvps;
 934     vmaxmin = &MacroAssembler::vminps;
 935     vcmp = &MacroAssembler::vcmpps;
 936   } else if (!is_double_word && !is_min) {
 937     mask = b;
 938     vblend = &MacroAssembler::vblendvps;
 939     vmaxmin = &MacroAssembler::vmaxps;
 940     vcmp = &MacroAssembler::vcmpps;
 941   } else if (is_double_word && is_min) {
 942     mask = a;
 943     vblend = &MacroAssembler::vblendvpd;
 944     vmaxmin = &MacroAssembler::vminpd;
 945     vcmp = &MacroAssembler::vcmppd;
 946   } else {
 947     assert(is_double_word && !is_min, "sanity");
 948     mask = b;
 949     vblend = &MacroAssembler::vblendvpd;
 950     vmaxmin = &MacroAssembler::vmaxpd;
 951     vcmp = &MacroAssembler::vcmppd;
 952   }
 953 
 954   // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
 955   XMMRegister maxmin, scratch;
 956   if (dst == btmp) {
 957     maxmin = btmp;
 958     scratch = tmp;
 959   } else {
 960     maxmin = tmp;
 961     scratch = btmp;
 962   }
 963 
 964   bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
 965   if (precompute_mask && !is_double_word) {
 966     vpsrad(tmp, mask, 32, vlen_enc);
 967     mask = tmp;
 968   } else if (precompute_mask && is_double_word) {
 969     vpxor(tmp, tmp, tmp, vlen_enc);
 970     vpcmpgtq(tmp, tmp, mask, vlen_enc);
 971     mask = tmp;
 972   }
 973 
 974   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
 975   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
 976   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
 977   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 978   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
 979 }
 980 
 981 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
 982                                     XMMRegister dst, XMMRegister a, XMMRegister b,
 983                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
 984                                     int vlen_enc) {
 985   assert(UseAVX > 2, "required");
 986   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
 987          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
 988   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
 989   assert_different_registers(dst, a, atmp, btmp);
 990   assert_different_registers(dst, b, atmp, btmp);
 991 
 992   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
 993   bool is_double_word = is_double_word_type(elem_bt);
 994   bool merge = true;
 995 
 996   if (!is_double_word && is_min) {
 997     evpmovd2m(ktmp, a, vlen_enc);
 998     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
 999     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1000     vminps(dst, atmp, btmp, vlen_enc);
1001     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1002     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1003   } else if (!is_double_word && !is_min) {
1004     evpmovd2m(ktmp, b, vlen_enc);
1005     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1006     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1007     vmaxps(dst, atmp, btmp, vlen_enc);
1008     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1009     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1010   } else if (is_double_word && is_min) {
1011     evpmovq2m(ktmp, a, vlen_enc);
1012     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1013     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1014     vminpd(dst, atmp, btmp, vlen_enc);
1015     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1016     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1017   } else {
1018     assert(is_double_word && !is_min, "sanity");
1019     evpmovq2m(ktmp, b, vlen_enc);
1020     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1021     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1022     vmaxpd(dst, atmp, btmp, vlen_enc);
1023     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1024     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1025   }
1026 }
1027 
1028 void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1029                                    XMMRegister src1, XMMRegister src2, int vlen_enc) {
1030   assert(opc == Op_MinV || opc == Op_MinReductionV ||
1031          opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");
1032 
1033   int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
1034                                                          : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
1035   if (elem_bt == T_FLOAT) {
1036     evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
1037   } else {
1038     assert(elem_bt == T_DOUBLE, "");
1039     evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
1040   }
1041 }
1042 
1043 // Float/Double signum
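     // A brief sketch (for orientation only) of the flag protocol used below:
     // ucomiss/ucomisd compare dst against +0.0 and the branches read the flags as
     //
     //   ZF == 1   -> dst is +/-0.0 (unordered also sets ZF)  : return dst unchanged
     //   PF == 1   -> dst is NaN                              : return dst unchanged
     //   above     -> dst > +0.0                              : return +1.0
     //   otherwise -> dst < +0.0                              : return -1.0 (sign of +1.0 flipped)
     //
     // The SSE moves that load +1.0 do not modify EFLAGS, so the 'above' test still
     // observes the flags of the original comparison.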
1044 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1045   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1046 
1047   Label DONE_LABEL;
1048 
1049   if (opcode == Op_SignumF) {
1050     ucomiss(dst, zero);
1051     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1052     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1053     movflt(dst, one);
1054     jcc(Assembler::above, DONE_LABEL);
1055     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1056   } else if (opcode == Op_SignumD) {
1057     ucomisd(dst, zero);
1058     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1059     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1060     movdbl(dst, one);
1061     jcc(Assembler::above, DONE_LABEL);
1062     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1063   }
1064 
1065   bind(DONE_LABEL);
1066 }
1067 
1068 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1069   if (sign) {
1070     pmovsxbw(dst, src);
1071   } else {
1072     pmovzxbw(dst, src);
1073   }
1074 }
1075 
1076 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1077   if (sign) {
1078     vpmovsxbw(dst, src, vector_len);
1079   } else {
1080     vpmovzxbw(dst, src, vector_len);
1081   }
1082 }
1083 
1084 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1085   if (sign) {
1086     vpmovsxbd(dst, src, vector_len);
1087   } else {
1088     vpmovzxbd(dst, src, vector_len);
1089   }
1090 }
1091 
1092 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1093   if (sign) {
1094     vpmovsxwd(dst, src, vector_len);
1095   } else {
1096     vpmovzxwd(dst, src, vector_len);
1097   }
1098 }
1099 
1100 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1101                                      int shift, int vector_len) {
1102   if (opcode == Op_RotateLeftV) {
1103     if (etype == T_INT) {
1104       evprold(dst, src, shift, vector_len);
1105     } else {
1106       assert(etype == T_LONG, "expected type T_LONG");
1107       evprolq(dst, src, shift, vector_len);
1108     }
1109   } else {
1110     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1111     if (etype == T_INT) {
1112       evprord(dst, src, shift, vector_len);
1113     } else {
1114       assert(etype == T_LONG, "expected type T_LONG");
1115       evprorq(dst, src, shift, vector_len);
1116     }
1117   }
1118 }
1119 
1120 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1121                                      XMMRegister shift, int vector_len) {
1122   if (opcode == Op_RotateLeftV) {
1123     if (etype == T_INT) {
1124       evprolvd(dst, src, shift, vector_len);
1125     } else {
1126       assert(etype == T_LONG, "expected type T_LONG");
1127       evprolvq(dst, src, shift, vector_len);
1128     }
1129   } else {
1130     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1131     if (etype == T_INT) {
1132       evprorvd(dst, src, shift, vector_len);
1133     } else {
1134       assert(etype == T_LONG, "expected type T_LONG");
1135       evprorvq(dst, src, shift, vector_len);
1136     }
1137   }
1138 }
1139 
1140 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1141   if (opcode == Op_RShiftVI) {
1142     psrad(dst, shift);
1143   } else if (opcode == Op_LShiftVI) {
1144     pslld(dst, shift);
1145   } else {
1146     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1147     psrld(dst, shift);
1148   }
1149 }
1150 
1151 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1152   switch (opcode) {
1153     case Op_RShiftVI:  psrad(dst, shift); break;
1154     case Op_LShiftVI:  pslld(dst, shift); break;
1155     case Op_URShiftVI: psrld(dst, shift); break;
1156 
1157     default: assert(false, "%s", NodeClassNames[opcode]);
1158   }
1159 }
1160 
1161 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1162   if (opcode == Op_RShiftVI) {
1163     vpsrad(dst, nds, shift, vector_len);
1164   } else if (opcode == Op_LShiftVI) {
1165     vpslld(dst, nds, shift, vector_len);
1166   } else {
1167     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1168     vpsrld(dst, nds, shift, vector_len);
1169   }
1170 }
1171 
1172 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1173   switch (opcode) {
1174     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1175     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1176     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1177 
1178     default: assert(false, "%s", NodeClassNames[opcode]);
1179   }
1180 }
1181 
1182 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1183   switch (opcode) {
1184     case Op_RShiftVB:  // fall-through
1185     case Op_RShiftVS:  psraw(dst, shift); break;
1186 
1187     case Op_LShiftVB:  // fall-through
1188     case Op_LShiftVS:  psllw(dst, shift);   break;
1189 
1190     case Op_URShiftVS: // fall-through
1191     case Op_URShiftVB: psrlw(dst, shift);  break;
1192 
1193     default: assert(false, "%s", NodeClassNames[opcode]);
1194   }
1195 }
1196 
1197 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1198   switch (opcode) {
1199     case Op_RShiftVB:  // fall-through
1200     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1201 
1202     case Op_LShiftVB:  // fall-through
1203     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1204 
1205     case Op_URShiftVS: // fall-through
1206     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1207 
1208     default: assert(false, "%s", NodeClassNames[opcode]);
1209   }
1210 }
1211 
1212 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1213   switch (opcode) {
1214     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1215     case Op_LShiftVL:  psllq(dst, shift); break;
1216     case Op_URShiftVL: psrlq(dst, shift); break;
1217 
1218     default: assert(false, "%s", NodeClassNames[opcode]);
1219   }
1220 }
1221 
1222 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1223   if (opcode == Op_RShiftVL) {
1224     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1225   } else if (opcode == Op_LShiftVL) {
1226     psllq(dst, shift);
1227   } else {
1228     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1229     psrlq(dst, shift);
1230   }
1231 }
1232 
1233 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1234   switch (opcode) {
1235     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1236     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1237     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1238 
1239     default: assert(false, "%s", NodeClassNames[opcode]);
1240   }
1241 }
1242 
1243 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1244   if (opcode == Op_RShiftVL) {
1245     evpsraq(dst, nds, shift, vector_len);
1246   } else if (opcode == Op_LShiftVL) {
1247     vpsllq(dst, nds, shift, vector_len);
1248   } else {
1249     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1250     vpsrlq(dst, nds, shift, vector_len);
1251   }
1252 }
1253 
1254 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1255   switch (opcode) {
1256     case Op_RShiftVB:  // fall-through
1257     case Op_RShiftVS:  // fall-through
1258     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1259 
1260     case Op_LShiftVB:  // fall-through
1261     case Op_LShiftVS:  // fall-through
1262     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1263 
1264     case Op_URShiftVB: // fall-through
1265     case Op_URShiftVS: // fall-through
1266     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1267 
1268     default: assert(false, "%s", NodeClassNames[opcode]);
1269   }
1270 }
1271 
1272 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1273   switch (opcode) {
1274     case Op_RShiftVB:  // fall-through
1275     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1276 
1277     case Op_LShiftVB:  // fall-through
1278     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1279 
1280     case Op_URShiftVB: // fall-through
1281     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1282 
1283     default: assert(false, "%s", NodeClassNames[opcode]);
1284   }
1285 }
1286 
1287 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1288   assert(UseAVX >= 2, "required");
1289   switch (opcode) {
1290     case Op_RShiftVL: {
1291       if (UseAVX > 2) {
1292         assert(tmp == xnoreg, "not used");
1293         if (!VM_Version::supports_avx512vl()) {
1294           vlen_enc = Assembler::AVX_512bit;
1295         }
1296         evpsravq(dst, src, shift, vlen_enc);
1297       } else {
1298         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1299         vpsrlvq(dst, src, shift, vlen_enc);
1300         vpsrlvq(tmp, tmp, shift, vlen_enc);
1301         vpxor(dst, dst, tmp, vlen_enc);
1302         vpsubq(dst, dst, tmp, vlen_enc);
1303       }
1304       break;
1305     }
1306     case Op_LShiftVL: {
1307       assert(tmp == xnoreg, "not used");
1308       vpsllvq(dst, src, shift, vlen_enc);
1309       break;
1310     }
1311     case Op_URShiftVL: {
1312       assert(tmp == xnoreg, "not used");
1313       vpsrlvq(dst, src, shift, vlen_enc);
1314       break;
1315     }
1316     default: assert(false, "%s", NodeClassNames[opcode]);
1317   }
1318 }
1319 
1320 // Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
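     // Roughly (a sketch, not the exact instruction sequence):
     //
     //   d   = extend_bytes_to_dwords(src)      // sign- or zero-extended per opcode
     //   d   = d <shift-op> zero_extend_bytes_to_dwords(shift)
     //   d  &= 0xFF                             // keep the low byte of each dword lane
     //   dst = pack_dwords_to_words(d.lo128, d.hi128)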
1321 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1322   assert(opcode == Op_LShiftVB ||
1323          opcode == Op_RShiftVB ||
1324          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1325   bool sign = (opcode != Op_URShiftVB);
1326   assert(vector_len == 0, "required");
1327   vextendbd(sign, dst, src, 1);
1328   vpmovzxbd(vtmp, shift, 1);
1329   varshiftd(opcode, dst, dst, vtmp, 1);
1330   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1331   vextracti128_high(vtmp, dst);
1332   vpackusdw(dst, dst, vtmp, 0);
1333 }
1334 
1335 // Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1336 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1337   assert(opcode == Op_LShiftVB ||
1338          opcode == Op_RShiftVB ||
1339          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1340   bool sign = (opcode != Op_URShiftVB);
1341   int ext_vector_len = vector_len + 1;
1342   vextendbw(sign, dst, src, ext_vector_len);
1343   vpmovzxbw(vtmp, shift, ext_vector_len);
1344   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1345   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1346   if (vector_len == 0) {
1347     vextracti128_high(vtmp, dst);
1348     vpackuswb(dst, dst, vtmp, vector_len);
1349   } else {
1350     vextracti64x4_high(vtmp, dst);
1351     vpackuswb(dst, dst, vtmp, vector_len);
1352     vpermq(dst, dst, 0xD8, vector_len);
1353   }
1354 }
1355 
1356 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1357   switch(typ) {
1358     case T_BYTE:
1359       pinsrb(dst, val, idx);
1360       break;
1361     case T_SHORT:
1362       pinsrw(dst, val, idx);
1363       break;
1364     case T_INT:
1365       pinsrd(dst, val, idx);
1366       break;
1367     case T_LONG:
1368       pinsrq(dst, val, idx);
1369       break;
1370     default:
1371       assert(false,"Should not reach here.");
1372       break;
1373   }
1374 }
1375 
1376 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1377   switch(typ) {
1378     case T_BYTE:
1379       vpinsrb(dst, src, val, idx);
1380       break;
1381     case T_SHORT:
1382       vpinsrw(dst, src, val, idx);
1383       break;
1384     case T_INT:
1385       vpinsrd(dst, src, val, idx);
1386       break;
1387     case T_LONG:
1388       vpinsrq(dst, src, val, idx);
1389       break;
1390     default:
1391       assert(false,"Should not reach here.");
1392       break;
1393   }
1394 }
1395 
1396 void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
1397                                          Register base, Register idx_base,
1398                                          Register mask, Register mask_idx,
1399                                          Register rtmp, int vlen_enc) {
1400   vpxor(dst, dst, dst, vlen_enc);
1401   if (elem_bt == T_SHORT) {
1402     for (int i = 0; i < 4; i++) {
1403       // dst[i] = mask[i] ? src[idx_base[i]] : 0
1404       Label skip_load;
1405       btq(mask, mask_idx);
1406       jccb(Assembler::carryClear, skip_load);
1407       movl(rtmp, Address(idx_base, i * 4));
1408       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1409       bind(skip_load);
1410       incq(mask_idx);
1411     }
1412   } else {
1413     assert(elem_bt == T_BYTE, "");
1414     for (int i = 0; i < 8; i++) {
1415       // dst[i] = mask[i] ? src[idx_base[i]] : 0
1416       Label skip_load;
1417       btq(mask, mask_idx);
1418       jccb(Assembler::carryClear, skip_load);
1419       movl(rtmp, Address(idx_base, i * 4));
1420       pinsrb(dst, Address(base, rtmp), i);
1421       bind(skip_load);
1422       incq(mask_idx);
1423     }
1424   }
1425 }
1426 
1427 void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
1428                                   Register base, Register idx_base,
1429                                   Register rtmp, int vlen_enc) {
1430   vpxor(dst, dst, dst, vlen_enc);
1431   if (elem_bt == T_SHORT) {
1432     for (int i = 0; i < 4; i++) {
1433       // dst[i] = src[idx_base[i]]
1434       movl(rtmp, Address(idx_base, i * 4));
1435       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1436     }
1437   } else {
1438     assert(elem_bt == T_BYTE, "");
1439     for (int i = 0; i < 8; i++) {
1440       // dst[i] = src[idx_base[i]]
1441       movl(rtmp, Address(idx_base, i * 4));
1442       pinsrb(dst, Address(base, rtmp), i);
1443     }
1444   }
1445 }
1446 
1447 /*
1448  * Gather using a hybrid algorithm: first partially unroll a scalar loop
1449  * to accumulate values from the gather indices into a quad-word (64-bit) slice.
1450  * A slice may hold 8 byte or 4 short values. This is followed by a vector
1451  * permutation to place the slice into the appropriate vector lane
1452  * locations in the destination vector. The following pseudo code describes
1453  * the algorithm in detail:
1454  *
1455  * DST_VEC = ZERO_VEC
1456  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1457  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1458  * FOREACH_ITER:
1459  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1460  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1461  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1462  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1463  *
1464  * With each iteration, the doubleword permute indices (0,1) corresponding
1465  * to the gathered quadword get right-shifted by two lane positions.
1466  *
1467  */
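// For example (an illustrative trace, assuming T_BYTE and a 32-byte vector):
// iteration 0 uses the identity permute, so the gathered quad-word stays in
// doublewords {0,1}; after PERM_INDEX -= TWO_VEC, iteration 1 routes the next
// quad-word to doublewords {2,3}, and so on, with the ORs accumulating the
// slices into DST_VEC until 'length' reaches zero.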
1468 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1469                                         Register base, Register idx_base,
1470                                         Register mask, XMMRegister xtmp1,
1471                                         XMMRegister xtmp2, XMMRegister temp_dst,
1472                                         Register rtmp, Register mask_idx,
1473                                         Register length, int vector_len, int vlen_enc) {
1474   Label GATHER8_LOOP;
1475   assert(is_subword_type(elem_ty), "");
1476   movl(length, vector_len);
1477   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1478   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1479   vallones(xtmp2, vlen_enc);
1480   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1481   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1482   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1483 
1484   bind(GATHER8_LOOP);
1485     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1486     if (mask == noreg) {
1487       vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
1488     } else {
1489       vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
1490     }
1491     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1492     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1493     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1494     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1495     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1496     vpor(dst, dst, temp_dst, vlen_enc);
1497     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
1498     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1499     jcc(Assembler::notEqual, GATHER8_LOOP);
1500 }
1501 
1502 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1503   switch(typ) {
1504     case T_INT:
1505       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1506       break;
1507     case T_FLOAT:
1508       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1509       break;
1510     case T_LONG:
1511       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1512       break;
1513     case T_DOUBLE:
1514       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1515       break;
1516     default:
1517       assert(false,"Should not reach here.");
1518       break;
1519   }
1520 }
1521 
1522 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1523   switch(typ) {
1524     case T_INT:
1525       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1526       break;
1527     case T_FLOAT:
1528       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1529       break;
1530     case T_LONG:
1531       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1532       break;
1533     case T_DOUBLE:
1534       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1535       break;
1536     default:
1537       assert(false,"Should not reach here.");
1538       break;
1539   }
1540 }
1541 
1542 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1543   switch(typ) {
1544     case T_INT:
1545       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1546       break;
1547     case T_FLOAT:
1548       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1549       break;
1550     case T_LONG:
1551       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1552       break;
1553     case T_DOUBLE:
1554       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1555       break;
1556     default:
1557       assert(false,"Should not reach here.");
1558       break;
1559   }
1560 }
1561 
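// Turns a vector of boolean bytes into an element mask: assuming each input byte
// is 0 or 1 (as the VectorLoadMask callers arrange), 0 - src yields 0x00 or 0xFF
// per byte, and the sign extension below widens that to an all-zeros or all-ones
// element of the requested type.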
1562 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1563   if (vlen_in_bytes <= 16) {
1564     pxor (dst, dst);
1565     psubb(dst, src);
1566     switch (elem_bt) {
1567       case T_BYTE:   /* nothing to do */ break;
1568       case T_SHORT:  pmovsxbw(dst, dst); break;
1569       case T_INT:    pmovsxbd(dst, dst); break;
1570       case T_FLOAT:  pmovsxbd(dst, dst); break;
1571       case T_LONG:   pmovsxbq(dst, dst); break;
1572       case T_DOUBLE: pmovsxbq(dst, dst); break;
1573 
1574       default: assert(false, "%s", type2name(elem_bt));
1575     }
1576   } else {
1577     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1578     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1579 
1580     vpxor (dst, dst, dst, vlen_enc);
1581     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1582 
1583     switch (elem_bt) {
1584       case T_BYTE:   /* nothing to do */            break;
1585       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1586       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1587       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1588       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1589       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1590 
1591       default: assert(false, "%s", type2name(elem_bt));
1592     }
1593   }
1594 }
1595 
1596 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1597   if (novlbwdq) {
1598     vpmovsxbd(xtmp, src, vlen_enc);
1599     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1600             Assembler::eq, true, vlen_enc, noreg);
1601   } else {
1602     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1603     vpsubb(xtmp, xtmp, src, vlen_enc);
1604     evpmovb2m(dst, xtmp, vlen_enc);
1605   }
1606 }
1607 
1608 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1609   if (is_integral_type(bt)) {
1610     switch (vlen_in_bytes) {
1611       case 4:  movdl(dst, src);   break;
1612       case 8:  movq(dst, src);    break;
1613       case 16: movdqu(dst, src);  break;
1614       case 32: vmovdqu(dst, src); break;
1615       case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1616       default: ShouldNotReachHere();
1617     }
1618   } else {
1619     switch (vlen_in_bytes) {
1620       case 4:  movflt(dst, src); break;
1621       case 8:  movdbl(dst, src); break;
1622       case 16: movups(dst, src); break;
1623       case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1624       case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1625       default: ShouldNotReachHere();
1626     }
1627   }
1628 }
1629 
1630 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1631   assert(rscratch != noreg || always_reachable(src), "missing");
1632 
1633   if (reachable(src)) {
1634     load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1635   } else {
1636     lea(rscratch, src);
1637     load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1638   }
1639 }
1640 
1641 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1642   int vlen_enc = vector_length_encoding(vlen);
1643   if (VM_Version::supports_avx()) {
1644     if (bt == T_LONG) {
1645       if (VM_Version::supports_avx2()) {
1646         vpbroadcastq(dst, src, vlen_enc);
1647       } else {
1648         vmovddup(dst, src, vlen_enc);
1649       }
1650     } else if (bt == T_DOUBLE) {
1651       if (vlen_enc != Assembler::AVX_128bit) {
1652         vbroadcastsd(dst, src, vlen_enc, noreg);
1653       } else {
1654         vmovddup(dst, src, vlen_enc);
1655       }
1656     } else {
1657       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1658         vpbroadcastd(dst, src, vlen_enc);
1659       } else {
1660         vbroadcastss(dst, src, vlen_enc);
1661       }
1662     }
1663   } else if (VM_Version::supports_sse3()) {
1664     movddup(dst, src);
1665   } else {
1666     load_vector(bt, dst, src, vlen);
1667   }
1668 }
1669 
1670 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1671   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between consecutive types is 64 bytes.
1672   int offset = exact_log2(type2aelembytes(bt)) << 6;
1673   if (is_floating_point_type(bt)) {
1674     offset += 128;
1675   }
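  // Worked example of the offset arithmetic above (illustrative): the layout is
  // B=0, S=64, I=128, L=192, F=256, D=320; e.g. for T_DOUBLE,
  // exact_log2(8) << 6 = 192 plus 128 for the floating-point group gives 320.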
1676   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1677   load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1678 }
1679 
1680 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1681 
1682 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1683   int vector_len = Assembler::AVX_128bit;
1684 
1685   switch (opcode) {
1686     case Op_AndReductionV:  pand(dst, src); break;
1687     case Op_OrReductionV:   por (dst, src); break;
1688     case Op_XorReductionV:  pxor(dst, src); break;
1689     case Op_MinReductionV:
1690       switch (typ) {
1691         case T_BYTE:        pminsb(dst, src); break;
1692         case T_SHORT:       pminsw(dst, src); break;
1693         case T_INT:         pminsd(dst, src); break;
1694         case T_LONG:        assert(UseAVX > 2, "required");
1695                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1696         default:            assert(false, "wrong type");
1697       }
1698       break;
1699     case Op_MaxReductionV:
1700       switch (typ) {
1701         case T_BYTE:        pmaxsb(dst, src); break;
1702         case T_SHORT:       pmaxsw(dst, src); break;
1703         case T_INT:         pmaxsd(dst, src); break;
1704         case T_LONG:        assert(UseAVX > 2, "required");
1705                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1706         default:            assert(false, "wrong type");
1707       }
1708       break;
1709     case Op_AddReductionVF: addss(dst, src); break;
1710     case Op_AddReductionVD: addsd(dst, src); break;
1711     case Op_AddReductionVI:
1712       switch (typ) {
1713         case T_BYTE:        paddb(dst, src); break;
1714         case T_SHORT:       paddw(dst, src); break;
1715         case T_INT:         paddd(dst, src); break;
1716         default:            assert(false, "wrong type");
1717       }
1718       break;
1719     case Op_AddReductionVL: paddq(dst, src); break;
1720     case Op_MulReductionVF: mulss(dst, src); break;
1721     case Op_MulReductionVD: mulsd(dst, src); break;
1722     case Op_MulReductionVI:
1723       switch (typ) {
1724         case T_SHORT:       pmullw(dst, src); break;
1725         case T_INT:         pmulld(dst, src); break;
1726         default:            assert(false, "wrong type");
1727       }
1728       break;
1729     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1730                             evpmullq(dst, dst, src, vector_len); break;
1731     default:                assert(false, "wrong opcode");
1732   }
1733 }
1734 
1735 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1736   switch (opcode) {
1737     case Op_AddReductionVF: addps(dst, src); break;
1738     case Op_AddReductionVD: addpd(dst, src); break;
1739     case Op_MulReductionVF: mulps(dst, src); break;
1740     case Op_MulReductionVD: mulpd(dst, src); break;
1741     default:                assert(false, "%s", NodeClassNames[opcode]);
1742   }
1743 }
1744 
1745 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1746   int vector_len = Assembler::AVX_256bit;
1747 
1748   switch (opcode) {
1749     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1750     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1751     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1752     case Op_MinReductionV:
1753       switch (typ) {
1754         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1755         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1756         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1757         case T_LONG:        assert(UseAVX > 2, "required");
1758                             vpminsq(dst, src1, src2, vector_len); break;
1759         default:            assert(false, "wrong type");
1760       }
1761       break;
1762     case Op_MaxReductionV:
1763       switch (typ) {
1764         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1765         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1766         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1767         case T_LONG:        assert(UseAVX > 2, "required");
1768                             vpmaxsq(dst, src1, src2, vector_len); break;
1769         default:            assert(false, "wrong type");
1770       }
1771       break;
1772     case Op_AddReductionVI:
1773       switch (typ) {
1774         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1775         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1776         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1777         default:            assert(false, "wrong type");
1778       }
1779       break;
1780     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1781     case Op_MulReductionVI:
1782       switch (typ) {
1783         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1784         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1785         default:            assert(false, "wrong type");
1786       }
1787       break;
1788     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1789     default:                assert(false, "wrong opcode");
1790   }
1791 }
1792 
1793 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1794   int vector_len = Assembler::AVX_256bit;
1795 
1796   switch (opcode) {
1797     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1798     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1799     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1800     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1801     default:                assert(false, "%s", NodeClassNames[opcode]);
1802   }
1803 }
1804 
1805 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1806                                   XMMRegister dst, XMMRegister src,
1807                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1808   switch (opcode) {
1809     case Op_AddReductionVF:
1810     case Op_MulReductionVF:
1811       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1812       break;
1813 
1814     case Op_AddReductionVD:
1815     case Op_MulReductionVD:
1816       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1817       break;
1818 
1819     default: assert(false, "wrong opcode");
1820   }
1821 }
1822 
1823 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1824                                             XMMRegister dst, XMMRegister src,
1825                                             XMMRegister vtmp1, XMMRegister vtmp2) {
1826   switch (opcode) {
1827     case Op_AddReductionVF:
1828     case Op_MulReductionVF:
1829       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1830       break;
1831 
1832     case Op_AddReductionVD:
1833     case Op_MulReductionVD:
1834       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1835       break;
1836 
1837     default: assert(false, "%s", NodeClassNames[opcode]);
1838   }
1839 }
1840 
1841 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1842                              Register dst, Register src1, XMMRegister src2,
1843                              XMMRegister vtmp1, XMMRegister vtmp2) {
1844   switch (vlen) {
1845     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1846     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1847     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1848     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1849 
1850     default: assert(false, "wrong vector length");
1851   }
1852 }
1853 
1854 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1855                              Register dst, Register src1, XMMRegister src2,
1856                              XMMRegister vtmp1, XMMRegister vtmp2) {
1857   switch (vlen) {
1858     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1859     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1860     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1861     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1862 
1863     default: assert(false, "wrong vector length");
1864   }
1865 }
1866 
1867 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1868                              Register dst, Register src1, XMMRegister src2,
1869                              XMMRegister vtmp1, XMMRegister vtmp2) {
1870   switch (vlen) {
1871     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1872     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1873     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1874     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1875 
1876     default: assert(false, "wrong vector length");
1877   }
1878 }
1879 
1880 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1881                              Register dst, Register src1, XMMRegister src2,
1882                              XMMRegister vtmp1, XMMRegister vtmp2) {
1883   switch (vlen) {
1884     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1885     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1886     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1887     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1888 
1889     default: assert(false, "wrong vector length");
1890   }
1891 }
1892 
1893 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1894                              Register dst, Register src1, XMMRegister src2,
1895                              XMMRegister vtmp1, XMMRegister vtmp2) {
1896   switch (vlen) {
1897     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1898     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1899     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1900 
1901     default: assert(false, "wrong vector length");
1902   }
1903 }
1904 
1905 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1906   switch (vlen) {
1907     case 2:
1908       assert(vtmp2 == xnoreg, "");
1909       reduce2F(opcode, dst, src, vtmp1);
1910       break;
1911     case 4:
1912       assert(vtmp2 == xnoreg, "");
1913       reduce4F(opcode, dst, src, vtmp1);
1914       break;
1915     case 8:
1916       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1917       break;
1918     case 16:
1919       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1920       break;
1921     default: assert(false, "wrong vector length");
1922   }
1923 }
1924 
1925 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1926   switch (vlen) {
1927     case 2:
1928       assert(vtmp2 == xnoreg, "");
1929       reduce2D(opcode, dst, src, vtmp1);
1930       break;
1931     case 4:
1932       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1933       break;
1934     case 8:
1935       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1936       break;
1937     default: assert(false, "wrong vector length");
1938   }
1939 }
1940 
1941 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1942   switch (vlen) {
1943     case 2:
1944       assert(vtmp1 == xnoreg, "");
1945       assert(vtmp2 == xnoreg, "");
1946       unorderedReduce2F(opcode, dst, src);
1947       break;
1948     case 4:
1949       assert(vtmp2 == xnoreg, "");
1950       unorderedReduce4F(opcode, dst, src, vtmp1);
1951       break;
1952     case 8:
1953       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
1954       break;
1955     case 16:
1956       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
1957       break;
1958     default: assert(false, "wrong vector length");
1959   }
1960 }
1961 
1962 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1963   switch (vlen) {
1964     case 2:
1965       assert(vtmp1 == xnoreg, "");
1966       assert(vtmp2 == xnoreg, "");
1967       unorderedReduce2D(opcode, dst, src);
1968       break;
1969     case 4:
1970       assert(vtmp2 == xnoreg, "");
1971       unorderedReduce4D(opcode, dst, src, vtmp1);
1972       break;
1973     case 8:
1974       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
1975       break;
1976     default: assert(false, "wrong vector length");
1977   }
1978 }
1979 
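// reduce2I: for AddReductionVI a single phaddd folds the two int elements; for the
// other opcodes the upper element is shuffled down and combined with the generic
// 128-bit operation. The scalar src1 is then folded in and the result moved to dst.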
1980 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1981   if (opcode == Op_AddReductionVI) {
1982     if (vtmp1 != src2) {
1983       movdqu(vtmp1, src2);
1984     }
1985     phaddd(vtmp1, vtmp1);
1986   } else {
1987     pshufd(vtmp1, src2, 0x1);
1988     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1989   }
1990   movdl(vtmp2, src1);
1991   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1992   movdl(dst, vtmp1);
1993 }
1994 
1995 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1996   if (opcode == Op_AddReductionVI) {
1997     if (vtmp1 != src2) {
1998       movdqu(vtmp1, src2);
1999     }
2000     phaddd(vtmp1, src2);
2001     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2002   } else {
2003     pshufd(vtmp2, src2, 0xE);
2004     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2005     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2006   }
2007 }
2008 
2009 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2010   if (opcode == Op_AddReductionVI) {
2011     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2012     vextracti128_high(vtmp2, vtmp1);
2013     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2014     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2015   } else {
2016     vextracti128_high(vtmp1, src2);
2017     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2018     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2019   }
2020 }
2021 
2022 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2023   vextracti64x4_high(vtmp2, src2);
2024   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2025   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2026 }
2027 
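// reduce8B: folds the low 8 bytes in half repeatedly (4+4, 2+2, 1+1) using a dword
// shuffle and byte shifts, then sign-extends the surviving byte to an int so it can
// be combined with the scalar src1 before being extracted (and sign-extended) into dst.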
2028 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2029   pshufd(vtmp2, src2, 0x1);
2030   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2031   movdqu(vtmp1, vtmp2);
2032   psrldq(vtmp1, 2);
2033   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2034   movdqu(vtmp2, vtmp1);
2035   psrldq(vtmp2, 1);
2036   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2037   movdl(vtmp2, src1);
2038   pmovsxbd(vtmp1, vtmp1);
2039   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2040   pextrb(dst, vtmp1, 0x0);
2041   movsbl(dst, dst);
2042 }
2043 
2044 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2045   pshufd(vtmp1, src2, 0xE);
2046   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2047   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2048 }
2049 
2050 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2051   vextracti128_high(vtmp2, src2);
2052   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2053   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2054 }
2055 
2056 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2057   vextracti64x4_high(vtmp1, src2);
2058   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2059   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2060 }
2061 
2062 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2063   pmovsxbw(vtmp2, src2);
2064   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2065 }
2066 
2067 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2068   if (UseAVX > 1) {
2069     int vector_len = Assembler::AVX_256bit;
2070     vpmovsxbw(vtmp1, src2, vector_len);
2071     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2072   } else {
2073     pmovsxbw(vtmp2, src2);
2074     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2075     pshufd(vtmp2, src2, 0xE);   // bring the upper 8 bytes into the low qword
2076     pmovsxbw(vtmp2, vtmp2);
2077     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2078   }
2079 }
2080 
2081 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2082   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2083     int vector_len = Assembler::AVX_512bit;
2084     vpmovsxbw(vtmp1, src2, vector_len);
2085     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2086   } else {
2087     assert(UseAVX >= 2,"Should not reach here.");
2088     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2089     vextracti128_high(vtmp2, src2);
2090     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2091   }
2092 }
2093 
2094 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2095   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2096   vextracti64x4_high(vtmp2, src2);
2097   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2098 }
2099 
2100 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2101   if (opcode == Op_AddReductionVI) {
2102     if (vtmp1 != src2) {
2103       movdqu(vtmp1, src2);
2104     }
2105     phaddw(vtmp1, vtmp1);
2106     phaddw(vtmp1, vtmp1);
2107   } else {
2108     pshufd(vtmp2, src2, 0x1);
2109     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2110     movdqu(vtmp1, vtmp2);
2111     psrldq(vtmp1, 2);
2112     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2113   }
2114   movdl(vtmp2, src1);
2115   pmovsxwd(vtmp1, vtmp1);
2116   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2117   pextrw(dst, vtmp1, 0x0);
2118   movswl(dst, dst);
2119 }
2120 
2121 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2122   if (opcode == Op_AddReductionVI) {
2123     if (vtmp1 != src2) {
2124       movdqu(vtmp1, src2);
2125     }
2126     phaddw(vtmp1, src2);
2127   } else {
2128     pshufd(vtmp1, src2, 0xE);
2129     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2130   }
2131   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2132 }
2133 
2134 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2135   if (opcode == Op_AddReductionVI) {
2136     int vector_len = Assembler::AVX_256bit;
2137     vphaddw(vtmp2, src2, src2, vector_len);
2138     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2139   } else {
2140     vextracti128_high(vtmp2, src2);
2141     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2142   }
2143   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2144 }
2145 
2146 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2147   int vector_len = Assembler::AVX_256bit;
2148   vextracti64x4_high(vtmp1, src2);
2149   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2150   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2151 }
2152 
2153 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2154   pshufd(vtmp2, src2, 0xE);
2155   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2156   movdq(vtmp1, src1);
2157   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2158   movdq(dst, vtmp1);
2159 }
2160 
2161 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2162   vextracti128_high(vtmp1, src2);
2163   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2164   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2165 }
2166 
2167 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2168   vextracti64x4_high(vtmp2, src2);
2169   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2170   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2171 }
2172 
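// genmask: materializes a k-register mask with the low 'len' bits set; e.g. len = 5
// yields 0b11111, since bzhiq clears every bit of the all-ones value from position
// 'len' upwards.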
2173 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2174   mov64(temp, -1L);
2175   bzhiq(temp, temp, len);
2176   kmovql(dst, temp);
2177 }
2178 
2179 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2180   reduce_operation_128(T_FLOAT, opcode, dst, src);
2181   pshufd(vtmp, src, 0x1);
2182   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2183 }
2184 
2185 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2186   reduce2F(opcode, dst, src, vtmp);
2187   pshufd(vtmp, src, 0x2);
2188   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2189   pshufd(vtmp, src, 0x3);
2190   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2191 }
2192 
2193 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2194   reduce4F(opcode, dst, src, vtmp2);
2195   vextractf128_high(vtmp2, src);
2196   reduce4F(opcode, dst, vtmp2, vtmp1);
2197 }
2198 
2199 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2200   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2201   vextracti64x4_high(vtmp1, src);
2202   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2203 }
2204 
2205 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2206   pshufd(dst, src, 0x1);
2207   reduce_operation_128(T_FLOAT, opcode, dst, src);
2208 }
2209 
2210 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2211   pshufd(vtmp, src, 0xE);
2212   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2213   unorderedReduce2F(opcode, dst, vtmp);
2214 }
2215 
2216 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2217   vextractf128_high(vtmp1, src);
2218   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2219   unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2220 }
2221 
2222 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2223   vextractf64x4_high(vtmp2, src);
2224   unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2225   unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2226 }
2227 
2228 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2229   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2230   pshufd(vtmp, src, 0xE);
2231   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2232 }
2233 
2234 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2235   reduce2D(opcode, dst, src, vtmp2);
2236   vextractf128_high(vtmp2, src);
2237   reduce2D(opcode, dst, vtmp2, vtmp1);
2238 }
2239 
2240 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2241   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2242   vextracti64x4_high(vtmp1, src);
2243   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2244 }
2245 
2246 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2247   pshufd(dst, src, 0xE);
2248   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2249 }
2250 
2251 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2252   vextractf128_high(vtmp, src);
2253   unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2254   unorderedReduce2D(opcode, dst, vtmp);
2255 }
2256 
2257 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2258   vextractf64x4_high(vtmp2, src);
2259   unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2260   unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2261 }
2262 
2263 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2264   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2265 }
2266 
2267 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2268   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2269 }
2270 
2271 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
2272   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2273 }
2274 
2275 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2276                                  int vec_enc) {
2277   switch(elem_bt) {
2278     case T_INT:
2279     case T_FLOAT:
2280       vmaskmovps(dst, src, mask, vec_enc);
2281       break;
2282     case T_LONG:
2283     case T_DOUBLE:
2284       vmaskmovpd(dst, src, mask, vec_enc);
2285       break;
2286     default:
2287       fatal("Unsupported type %s", type2name(elem_bt));
2288       break;
2289   }
2290 }
2291 
2292 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2293                                  int vec_enc) {
2294   switch(elem_bt) {
2295     case T_INT:
2296     case T_FLOAT:
2297       vmaskmovps(dst, src, mask, vec_enc);
2298       break;
2299     case T_LONG:
2300     case T_DOUBLE:
2301       vmaskmovpd(dst, src, mask, vec_enc);
2302       break;
2303     default:
2304       fatal("Unsupported type %s", type2name(elem_bt));
2305       break;
2306   }
2307 }
2308 
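// reduceFloatMinMax: folds the vector in half on each iteration (upper 256 bits,
// then upper 128 bits, then within a 128-bit lane via vpermilps), applying the
// min/max kernel each time; e.g. for vlen == 16 the loop runs with i = 3, 2, 1, 0.
// When is_dst_valid, the incoming dst is folded in as a final step.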
2309 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2310                                           XMMRegister dst, XMMRegister src,
2311                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2312                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2313   const int permconst[] = {1, 14};
2314   XMMRegister wsrc = src;
2315   XMMRegister wdst = xmm_0;
2316   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2317 
2318   int vlen_enc = Assembler::AVX_128bit;
2319   if (vlen == 16) {
2320     vlen_enc = Assembler::AVX_256bit;
2321   }
2322 
2323   for (int i = log2(vlen) - 1; i >=0; i--) {
2324     if (i == 0 && !is_dst_valid) {
2325       wdst = dst;
2326     }
2327     if (i == 3) {
2328       vextracti64x4_high(wtmp, wsrc);
2329     } else if (i == 2) {
2330       vextracti128_high(wtmp, wsrc);
2331     } else { // i = [0,1]
2332       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2333     }
2334 
2335     if (VM_Version::supports_avx10_2()) {
2336       vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
2337     } else {
2338       vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2339     }
2340     wsrc = wdst;
2341     vlen_enc = Assembler::AVX_128bit;
2342   }
2343   if (is_dst_valid) {
2344     if (VM_Version::supports_avx10_2()) {
2345       vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
2346     } else {
2347       vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2348     }
2349   }
2350 }
2351 
2352 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2353                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2354                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2355   XMMRegister wsrc = src;
2356   XMMRegister wdst = xmm_0;
2357   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2358   int vlen_enc = Assembler::AVX_128bit;
2359   if (vlen == 8) {
2360     vlen_enc = Assembler::AVX_256bit;
2361   }
2362   for (int i = log2(vlen) - 1; i >=0; i--) {
2363     if (i == 0 && !is_dst_valid) {
2364       wdst = dst;
2365     }
2366     if (i == 1) {
2367       vextracti128_high(wtmp, wsrc);
2368     } else if (i == 2) {
2369       vextracti64x4_high(wtmp, wsrc);
2370     } else {
2371       assert(i == 0, "%d", i);
2372       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2373     }
2374 
2375     if (VM_Version::supports_avx10_2()) {
2376       vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
2377     } else {
2378       vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2379     }
2380 
2381     wsrc = wdst;
2382     vlen_enc = Assembler::AVX_128bit;
2383   }
2384 
2385   if (is_dst_valid) {
2386     if (VM_Version::supports_avx10_2()) {
2387       vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
2388     } else {
2389       vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2390     }
2391   }
2392 }
2393 
2394 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2395   switch (bt) {
2396     case T_BYTE:  pextrb(dst, src, idx); break;
2397     case T_SHORT: pextrw(dst, src, idx); break;
2398     case T_INT:   pextrd(dst, src, idx); break;
2399     case T_LONG:  pextrq(dst, src, idx); break;
2400 
2401     default:
2402       assert(false,"Should not reach here.");
2403       break;
2404   }
2405 }
2406 
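// get_lane: returns the 128-bit lane holding element 'elemindex', extracting it into
// dst when it is not the lowest lane. For example (illustrative), T_INT with
// elemindex 9 gives elem_per_lane = 4 and lane = 2, so the third lane is extracted
// (which requires UseAVX > 2).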
2407 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2408   int esize =  type2aelembytes(typ);
2409   int elem_per_lane = 16/esize;
2410   int lane = elemindex / elem_per_lane;
2411   int eindex = elemindex % elem_per_lane;
2412 
2413   if (lane >= 2) {
2414     assert(UseAVX > 2, "required");
2415     vextractf32x4(dst, src, lane & 3);
2416     return dst;
2417   } else if (lane > 0) {
2418     assert(UseAVX > 0, "required");
2419     vextractf128(dst, src, lane);
2420     return dst;
2421   } else {
2422     return src;
2423   }
2424 }
2425 
2426 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2427   if (typ == T_BYTE) {
2428     movsbl(dst, dst);
2429   } else if (typ == T_SHORT) {
2430     movswl(dst, dst);
2431   }
2432 }
2433 
2434 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2435   int esize =  type2aelembytes(typ);
2436   int elem_per_lane = 16/esize;
2437   int eindex = elemindex % elem_per_lane;
2438   assert(is_integral_type(typ),"required");
2439 
2440   if (eindex == 0) {
2441     if (typ == T_LONG) {
2442       movq(dst, src);
2443     } else {
2444       movdl(dst, src);
2445       movsxl(typ, dst);
2446     }
2447   } else {
2448     extract(typ, dst, src, eindex);
2449     movsxl(typ, dst);
2450   }
2451 }
2452 
2453 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2454   int esize =  type2aelembytes(typ);
2455   int elem_per_lane = 16/esize;
2456   int eindex = elemindex % elem_per_lane;
2457   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2458 
2459   if (eindex == 0) {
2460     movq(dst, src);
2461   } else {
2462     if (typ == T_FLOAT) {
2463       if (UseAVX == 0) {
2464         movdqu(dst, src);
2465         shufps(dst, dst, eindex);
2466       } else {
2467         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2468       }
2469     } else {
2470       if (UseAVX == 0) {
2471         movdqu(dst, src);
2472         psrldq(dst, eindex*esize);
2473       } else {
2474         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2475       }
2476       movq(dst, dst);
2477     }
2478   }
2479   // Zero upper bits
2480   if (typ == T_FLOAT) {
2481     if (UseAVX == 0) {
2482       assert(vtmp != xnoreg, "required.");
2483       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2484       pand(dst, vtmp);
2485     } else {
2486       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2487     }
2488   }
2489 }
2490 
2491 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2492   switch(typ) {
2493     case T_BYTE:
2494     case T_BOOLEAN:
2495       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2496       break;
2497     case T_SHORT:
2498     case T_CHAR:
2499       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2500       break;
2501     case T_INT:
2502     case T_FLOAT:
2503       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2504       break;
2505     case T_LONG:
2506     case T_DOUBLE:
2507       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2508       break;
2509     default:
2510       assert(false,"Should not reach here.");
2511       break;
2512   }
2513 }
2514 
2515 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2516   assert(rscratch != noreg || always_reachable(src2), "missing");
2517 
2518   switch(typ) {
2519     case T_BOOLEAN:
2520     case T_BYTE:
2521       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2522       break;
2523     case T_CHAR:
2524     case T_SHORT:
2525       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2526       break;
2527     case T_INT:
2528     case T_FLOAT:
2529       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2530       break;
2531     case T_LONG:
2532     case T_DOUBLE:
2533       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2534       break;
2535     default:
2536       assert(false,"Should not reach here.");
2537       break;
2538   }
2539 }
2540 
2541 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2542   switch(typ) {
2543     case T_BYTE:
2544       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2545       break;
2546     case T_SHORT:
2547       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2548       break;
2549     case T_INT:
2550     case T_FLOAT:
2551       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2552       break;
2553     case T_LONG:
2554     case T_DOUBLE:
2555       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2556       break;
2557     default:
2558       assert(false,"Should not reach here.");
2559       break;
2560   }
2561 }
2562 
2563 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2564   assert(vlen_in_bytes <= 32, "");
2565   int esize = type2aelembytes(bt);
2566   if (vlen_in_bytes == 32) {
2567     assert(vtmp == xnoreg, "required.");
2568     if (esize >= 4) {
2569       vtestps(src1, src2, AVX_256bit);
2570     } else {
2571       vptest(src1, src2, AVX_256bit);
2572     }
2573     return;
2574   }
2575   if (vlen_in_bytes < 16) {
2576     // Duplicate the lower part to fill the whole register;
2577     // there is no need to do so for src2.
2578     assert(vtmp != xnoreg, "required");
2579     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2580     pshufd(vtmp, src1, shuffle_imm);
2581   } else {
2582     assert(vtmp == xnoreg, "required");
2583     vtmp = src1;
2584   }
2585   if (esize >= 4 && VM_Version::supports_avx()) {
2586     vtestps(vtmp, src2, AVX_128bit);
2587   } else {
2588     ptest(vtmp, src2);
2589   }
2590 }
2591 
2592 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2593 #ifdef ASSERT
2594   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2595   bool is_bw_supported = VM_Version::supports_avx512bw();
2596   if (is_bw && !is_bw_supported) {
2597     assert(vlen_enc != Assembler::AVX_512bit, "required");
2598     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2599            "XMM register should be 0-15");
2600   }
2601 #endif // ASSERT
2602   switch (elem_bt) {
2603     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2604     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2605     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2606     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2607     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2608     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2609     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2610   }
2611 }
2612 
2613 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2614   assert(UseAVX >= 2, "required");
2615   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2616   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2617   if ((UseAVX > 2) &&
2618       (!is_bw || VM_Version::supports_avx512bw()) &&
2619       (!is_vl || VM_Version::supports_avx512vl())) {
2620     switch (elem_bt) {
2621       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2622       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2623       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2624       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2625       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2626     }
2627   } else {
2628     assert(vlen_enc != Assembler::AVX_512bit, "required");
2629     assert((dst->encoding() < 16),"XMM register should be 0-15");
2630     switch (elem_bt) {
2631       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2632       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2633       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2634       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2635       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2636       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2637       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2638     }
2639   }
2640 }
2641 
2642 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2643   switch (to_elem_bt) {
2644     case T_SHORT:
2645       vpmovsxbw(dst, src, vlen_enc);
2646       break;
2647     case T_INT:
2648       vpmovsxbd(dst, src, vlen_enc);
2649       break;
2650     case T_FLOAT:
2651       vpmovsxbd(dst, src, vlen_enc);
2652       vcvtdq2ps(dst, dst, vlen_enc);
2653       break;
2654     case T_LONG:
2655       vpmovsxbq(dst, src, vlen_enc);
2656       break;
2657     case T_DOUBLE: {
2658       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2659       vpmovsxbd(dst, src, mid_vlen_enc);
2660       vcvtdq2pd(dst, dst, vlen_enc);
2661       break;
2662     }
2663     default:
2664       fatal("Unsupported type %s", type2name(to_elem_bt));
2665       break;
2666   }
2667 }
2668 
2669 //-------------------------------------------------------------------------------------------
2670 
2671 // IndexOf for constant substrings with size >= 8 chars
2672 // which don't need to be loaded through the stack.
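// Rough outline of the code below (illustrative pseudo code only):
//   load one 16-byte vector of the substring head into 'vec';
//   loop: pcmpestri(vec, [result]) -> candidate offset in tmp(rcx);
//         no candidate  -> advance 'result' by 16 bytes and rescan;
//         full match    -> done, for substrings that fit in one vector;
//         partial match -> for longer substrings verify the tail with further
//                          pcmpestri steps, restarting one element later on failure.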
2673 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2674                                          Register cnt1, Register cnt2,
2675                                          int int_cnt2,  Register result,
2676                                          XMMRegister vec, Register tmp,
2677                                          int ae) {
2678   ShortBranchVerifier sbv(this);
2679   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2680   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2681 
2682   // This method uses the pcmpestri instruction with bound registers
2683   //   inputs:
2684   //     xmm - substring
2685   //     rax - substring length (elements count)
2686   //     mem - scanned string
2687   //     rdx - string length (elements count)
2688   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2689   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2690   //   outputs:
2691   //     rcx - matched index in string
2692   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2693   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2694   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2695   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2696   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2697 
2698   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2699         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2700         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2701 
2702   // Note, inline_string_indexOf() generates checks:
2703   // if (substr.count > string.count) return -1;
2704   // if (substr.count == 0) return 0;
2705   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2706 
2707   // Load substring.
2708   if (ae == StrIntrinsicNode::UL) {
2709     pmovzxbw(vec, Address(str2, 0));
2710   } else {
2711     movdqu(vec, Address(str2, 0));
2712   }
2713   movl(cnt2, int_cnt2);
2714   movptr(result, str1); // string addr
2715 
2716   if (int_cnt2 > stride) {
2717     jmpb(SCAN_TO_SUBSTR);
2718 
2719     // Reload substr for rescan; this code
2720     // is executed only for large substrings (> 8 chars).
2721     bind(RELOAD_SUBSTR);
2722     if (ae == StrIntrinsicNode::UL) {
2723       pmovzxbw(vec, Address(str2, 0));
2724     } else {
2725       movdqu(vec, Address(str2, 0));
2726     }
2727     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2728 
2729     bind(RELOAD_STR);
2730     // We came here after the beginning of the substring was
2731     // matched but the rest of it was not, so we need to search
2732     // again. Start from the next element after the previous match.
2733 
2734     // cnt2 is the number of remaining substring elements and
2735     // cnt1 is the number of remaining string elements when the compare failed.
2736     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2737     subl(cnt1, cnt2);
2738     addl(cnt1, int_cnt2);
2739     movl(cnt2, int_cnt2); // Now restore cnt2
2740 
2741     decrementl(cnt1);     // Shift to next element
2742     cmpl(cnt1, cnt2);
2743     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2744 
2745     addptr(result, (1<<scale1));
2746 
2747   } // (int_cnt2 > 8)
2748 
2749   // Scan string for start of substr in 16-byte vectors
2750   bind(SCAN_TO_SUBSTR);
2751   pcmpestri(vec, Address(result, 0), mode);
2752   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2753   subl(cnt1, stride);
2754   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2755   cmpl(cnt1, cnt2);
2756   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2757   addptr(result, 16);
2758   jmpb(SCAN_TO_SUBSTR);
2759 
2760   // Found a potential substr
2761   bind(FOUND_CANDIDATE);
2762   // Matched whole vector if first element matched (tmp(rcx) == 0).
2763   if (int_cnt2 == stride) {
2764     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2765   } else { // int_cnt2 > 8
2766     jccb(Assembler::overflow, FOUND_SUBSTR);
2767   }
2768   // After pcmpestri tmp(rcx) contains matched element index
2769   // Compute start addr of substr
2770   lea(result, Address(result, tmp, scale1));
2771 
2772   // Make sure string is still long enough
2773   subl(cnt1, tmp);
2774   cmpl(cnt1, cnt2);
2775   if (int_cnt2 == stride) {
2776     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2777   } else { // int_cnt2 > 8
2778     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2779   }
  // Left less than substring.
2781 
2782   bind(RET_NOT_FOUND);
2783   movl(result, -1);
2784   jmp(EXIT);
2785 
2786   if (int_cnt2 > stride) {
    // This code is optimized for the case when the whole substring
    // is matched if its head is matched.
2789     bind(MATCH_SUBSTR_HEAD);
2790     pcmpestri(vec, Address(result, 0), mode);
2791     // Reload only string if does not match
2792     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2793 
2794     Label CONT_SCAN_SUBSTR;
2795     // Compare the rest of substring (> 8 chars).
2796     bind(FOUND_SUBSTR);
2797     // First 8 chars are already matched.
2798     negptr(cnt2);
2799     addptr(cnt2, stride);
2800 
2801     bind(SCAN_SUBSTR);
2802     subl(cnt1, stride);
2803     cmpl(cnt2, -stride); // Do not read beyond substring
2804     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2805     // Back-up strings to avoid reading beyond substring:
2806     // cnt1 = cnt1 - cnt2 + 8
2807     addl(cnt1, cnt2); // cnt2 is negative
2808     addl(cnt1, stride);
2809     movl(cnt2, stride); negptr(cnt2);
2810     bind(CONT_SCAN_SUBSTR);
2811     if (int_cnt2 < (int)G) {
2812       int tail_off1 = int_cnt2<<scale1;
2813       int tail_off2 = int_cnt2<<scale2;
2814       if (ae == StrIntrinsicNode::UL) {
2815         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2816       } else {
2817         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2818       }
2819       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2820     } else {
2821       // calculate index in register to avoid integer overflow (int_cnt2*2)
2822       movl(tmp, int_cnt2);
2823       addptr(tmp, cnt2);
2824       if (ae == StrIntrinsicNode::UL) {
2825         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2826       } else {
2827         movdqu(vec, Address(str2, tmp, scale2, 0));
2828       }
2829       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2830     }
2831     // Need to reload strings pointers if not matched whole vector
2832     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2833     addptr(cnt2, stride);
2834     jcc(Assembler::negative, SCAN_SUBSTR);
2835     // Fall through if found full substring
2836 
2837   } // (int_cnt2 > 8)
2838 
2839   bind(RET_FOUND);
2840   // Found result if we matched full small substring.
2841   // Compute substr offset
2842   subptr(result, str1);
2843   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2844     shrl(result, 1); // index
2845   }
2846   bind(EXIT);
2847 
2848 } // string_indexofC8
2849 
// Small strings are loaded through the stack if they cross a page boundary.
2851 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2852                                        Register cnt1, Register cnt2,
2853                                        int int_cnt2,  Register result,
2854                                        XMMRegister vec, Register tmp,
2855                                        int ae) {
2856   ShortBranchVerifier sbv(this);
2857   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2858   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2859 
2860   //
  // int_cnt2 is the length of a small (< 8 chars) constant substring,
  // or -1 for a non-constant substring, in which case its length
  // is in the cnt2 register.
2864   //
2865   // Note, inline_string_indexOf() generates checks:
2866   // if (substr.count > string.count) return -1;
2867   // if (substr.count == 0) return 0;
2868   //
2869   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2870   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2871   // This method uses the pcmpestri instruction with bound registers
2872   //   inputs:
2873   //     xmm - substring
2874   //     rax - substring length (elements count)
2875   //     mem - scanned string
2876   //     rdx - string length (elements count)
2877   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2878   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2879   //   outputs:
2880   //     rcx - matched index in string
2881   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2882   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2883   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2884   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2885 
2886   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2887         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2888         FOUND_CANDIDATE;
2889 
2890   { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through the stack.
2893     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2894 
2895     movptr(tmp, rsp); // save old SP
2896 
2897     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2898       if (int_cnt2 == (1>>scale2)) { // One byte
2899         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2900         load_unsigned_byte(result, Address(str2, 0));
2901         movdl(vec, result); // move 32 bits
2902       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2903         // Not enough header space in 32-bit VM: 12+3 = 15.
2904         movl(result, Address(str2, -1));
2905         shrl(result, 8);
2906         movdl(vec, result); // move 32 bits
2907       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2908         load_unsigned_short(result, Address(str2, 0));
2909         movdl(vec, result); // move 32 bits
2910       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2911         movdl(vec, Address(str2, 0)); // move 32 bits
2912       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2913         movq(vec, Address(str2, 0));  // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
2915         // Array header size is 12 bytes in 32-bit VM
2916         // + 6 bytes for 3 chars == 18 bytes,
2917         // enough space to load vec and shift.
2918         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2919         if (ae == StrIntrinsicNode::UL) {
2920           int tail_off = int_cnt2-8;
2921           pmovzxbw(vec, Address(str2, tail_off));
2922           psrldq(vec, -2*tail_off);
2923         }
2924         else {
2925           int tail_off = int_cnt2*(1<<scale2);
2926           movdqu(vec, Address(str2, tail_off-16));
2927           psrldq(vec, 16-tail_off);
2928         }
2929       }
2930     } else { // not constant substring
2931       cmpl(cnt2, stride);
2932       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2933 
      // We can read beyond the string if str+16 does not cross a page boundary
2935       // since heaps are aligned and mapped by pages.
2936       assert(os::vm_page_size() < (int)G, "default page should be small");
2937       movl(result, str2); // We need only low 32 bits
2938       andl(result, ((int)os::vm_page_size()-1));
2939       cmpl(result, ((int)os::vm_page_size()-16));
2940       jccb(Assembler::belowEqual, CHECK_STR);
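      // (E.g., assuming the default 4 KiB page: the branch above is taken
      //  when (str2 & 0xfff) <= 0xff0, so a 16-byte load at str2 stays
      //  within the page.)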
2941 
      // Move small strings to the stack to allow loading 16 bytes into vec.
2943       subptr(rsp, 16);
2944       int stk_offset = wordSize-(1<<scale2);
2945       push(cnt2);
2946 
2947       bind(COPY_SUBSTR);
2948       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2949         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2950         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2951       } else if (ae == StrIntrinsicNode::UU) {
2952         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2953         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2954       }
2955       decrement(cnt2);
2956       jccb(Assembler::notZero, COPY_SUBSTR);
2957 
2958       pop(cnt2);
2959       movptr(str2, rsp);  // New substring address
2960     } // non constant
2961 
2962     bind(CHECK_STR);
2963     cmpl(cnt1, stride);
2964     jccb(Assembler::aboveEqual, BIG_STRINGS);
2965 
2966     // Check cross page boundary.
2967     movl(result, str1); // We need only low 32 bits
2968     andl(result, ((int)os::vm_page_size()-1));
2969     cmpl(result, ((int)os::vm_page_size()-16));
2970     jccb(Assembler::belowEqual, BIG_STRINGS);
2971 
2972     subptr(rsp, 16);
2973     int stk_offset = -(1<<scale1);
2974     if (int_cnt2 < 0) { // not constant
2975       push(cnt2);
2976       stk_offset += wordSize;
2977     }
2978     movl(cnt2, cnt1);
2979 
2980     bind(COPY_STR);
2981     if (ae == StrIntrinsicNode::LL) {
2982       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2983       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2984     } else {
2985       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2986       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2987     }
2988     decrement(cnt2);
2989     jccb(Assembler::notZero, COPY_STR);
2990 
2991     if (int_cnt2 < 0) { // not constant
2992       pop(cnt2);
2993     }
2994     movptr(str1, rsp);  // New string address
2995 
2996     bind(BIG_STRINGS);
2997     // Load substring.
2998     if (int_cnt2 < 0) { // -1
2999       if (ae == StrIntrinsicNode::UL) {
3000         pmovzxbw(vec, Address(str2, 0));
3001       } else {
3002         movdqu(vec, Address(str2, 0));
3003       }
3004       push(cnt2);       // substr count
3005       push(str2);       // substr addr
3006       push(str1);       // string addr
3007     } else {
3008       // Small (< 8 chars) constant substrings are loaded already.
3009       movl(cnt2, int_cnt2);
3010     }
3011     push(tmp);  // original SP
3012 
3013   } // Finished loading
3014 
3015   //========================================================
3016   // Start search
3017   //
3018 
3019   movptr(result, str1); // string addr
3020 
3021   if (int_cnt2  < 0) {  // Only for non constant substring
3022     jmpb(SCAN_TO_SUBSTR);
3023 
3024     // SP saved at sp+0
3025     // String saved at sp+1*wordSize
3026     // Substr saved at sp+2*wordSize
3027     // Substr count saved at sp+3*wordSize
3028 
    // Reload substr for rescan; this code
    // is executed only for large substrings (> 8 chars).
3031     bind(RELOAD_SUBSTR);
3032     movptr(str2, Address(rsp, 2*wordSize));
3033     movl(cnt2, Address(rsp, 3*wordSize));
3034     if (ae == StrIntrinsicNode::UL) {
3035       pmovzxbw(vec, Address(str2, 0));
3036     } else {
3037       movdqu(vec, Address(str2, 0));
3038     }
    // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
    // again. Start from the next element after the previous match.
3042     subptr(str1, result); // Restore counter
3043     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3044       shrl(str1, 1);
3045     }
3046     addl(cnt1, str1);
3047     decrementl(cnt1);   // Shift to next element
3048     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3050 
3051     addptr(result, (1<<scale1));
3052   } // non constant
3053 
3054   // Scan string for start of substr in 16-byte vectors
3055   bind(SCAN_TO_SUBSTR);
3056   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3057   pcmpestri(vec, Address(result, 0), mode);
3058   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3059   subl(cnt1, stride);
3060   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3061   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3063   addptr(result, 16);
3064 
3065   bind(ADJUST_STR);
3066   cmpl(cnt1, stride); // Do not read beyond string
3067   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3068   // Back-up string to avoid reading beyond string.
3069   lea(result, Address(result, cnt1, scale1, -16));
3070   movl(cnt1, stride);
3071   jmpb(SCAN_TO_SUBSTR);
3072 
3073   // Found a potential substr
3074   bind(FOUND_CANDIDATE);
3075   // After pcmpestri tmp(rcx) contains matched element index
3076 
3077   // Make sure string is still long enough
3078   subl(cnt1, tmp);
3079   cmpl(cnt1, cnt2);
3080   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.
3082 
3083   bind(RET_NOT_FOUND);
3084   movl(result, -1);
3085   jmp(CLEANUP);
3086 
3087   bind(FOUND_SUBSTR);
3088   // Compute start addr of substr
3089   lea(result, Address(result, tmp, scale1));
3090   if (int_cnt2 > 0) { // Constant substring
3091     // Repeat search for small substring (< 8 chars)
3092     // from new point without reloading substring.
3093     // Have to check that we don't read beyond string.
3094     cmpl(tmp, stride-int_cnt2);
3095     jccb(Assembler::greater, ADJUST_STR);
3096     // Fall through if matched whole substring.
3097   } else { // non constant
3098     assert(int_cnt2 == -1, "should be != 0");
3099 
3100     addl(tmp, cnt2);
3101     // Found result if we matched whole substring.
3102     cmpl(tmp, stride);
3103     jcc(Assembler::lessEqual, RET_FOUND);
3104 
3105     // Repeat search for small substring (<= 8 chars)
3106     // from new point 'str1' without reloading substring.
3107     cmpl(cnt2, stride);
3108     // Have to check that we don't read beyond string.
3109     jccb(Assembler::lessEqual, ADJUST_STR);
3110 
3111     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3112     // Compare the rest of substring (> 8 chars).
3113     movptr(str1, result);
3114 
3115     cmpl(tmp, cnt2);
3116     // First 8 chars are already matched.
3117     jccb(Assembler::equal, CHECK_NEXT);
3118 
3119     bind(SCAN_SUBSTR);
3120     pcmpestri(vec, Address(str1, 0), mode);
3121     // Need to reload strings pointers if not matched whole vector
3122     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3123 
3124     bind(CHECK_NEXT);
3125     subl(cnt2, stride);
3126     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3127     addptr(str1, 16);
3128     if (ae == StrIntrinsicNode::UL) {
3129       addptr(str2, 8);
3130     } else {
3131       addptr(str2, 16);
3132     }
3133     subl(cnt1, stride);
3134     cmpl(cnt2, stride); // Do not read beyond substring
3135     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3136     // Back-up strings to avoid reading beyond substring.
3137 
3138     if (ae == StrIntrinsicNode::UL) {
3139       lea(str2, Address(str2, cnt2, scale2, -8));
3140       lea(str1, Address(str1, cnt2, scale1, -16));
3141     } else {
3142       lea(str2, Address(str2, cnt2, scale2, -16));
3143       lea(str1, Address(str1, cnt2, scale1, -16));
3144     }
3145     subl(cnt1, cnt2);
3146     movl(cnt2, stride);
3147     addl(cnt1, stride);
3148     bind(CONT_SCAN_SUBSTR);
3149     if (ae == StrIntrinsicNode::UL) {
3150       pmovzxbw(vec, Address(str2, 0));
3151     } else {
3152       movdqu(vec, Address(str2, 0));
3153     }
3154     jmp(SCAN_SUBSTR);
3155 
3156     bind(RET_FOUND_LONG);
3157     movptr(str1, Address(rsp, wordSize));
3158   } // non constant
3159 
3160   bind(RET_FOUND);
3161   // Compute substr offset
3162   subptr(result, str1);
3163   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3164     shrl(result, 1); // index
3165   }
3166   bind(CLEANUP);
3167   pop(rsp); // restore SP
3168 
3169 } // string_indexof
3170 
3171 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3172                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3173   ShortBranchVerifier sbv(this);
3174   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
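
  // An illustrative Java-level sketch of what this stub computes (reference
  // only, not the exact library source; the method name is hypothetical):
  // scan a UTF-16 char sequence of length cnt1 for the first occurrence of
  // ch and return its index, or -1 if it is not present.
  //
  //   static int indexOfChar(char[] str, char ch, int len) {
  //     for (int i = 0; i < len; i++) {
  //       if (str[i] == ch) {
  //         return i;
  //       }
  //     }
  //     return -1;
  //   }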
3175 
3176   int stride = 8;
3177 
3178   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3179         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3180         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3181         FOUND_SEQ_CHAR, DONE_LABEL;
3182 
3183   movptr(result, str1);
3184   if (UseAVX >= 2) {
3185     cmpl(cnt1, stride);
3186     jcc(Assembler::less, SCAN_TO_CHAR);
3187     cmpl(cnt1, 2*stride);
3188     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3189     movdl(vec1, ch);
3190     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3191     vpxor(vec2, vec2);
3192     movl(tmp, cnt1);
3193     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3194     andl(cnt1,0x0000000F);  //tail count (in chars)
3195 
3196     bind(SCAN_TO_16_CHAR_LOOP);
3197     vmovdqu(vec3, Address(result, 0));
3198     vpcmpeqw(vec3, vec3, vec1, 1);
3199     vptest(vec2, vec3);
3200     jcc(Assembler::carryClear, FOUND_CHAR);
3201     addptr(result, 32);
3202     subl(tmp, 2*stride);
3203     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3204     jmp(SCAN_TO_8_CHAR);
3205     bind(SCAN_TO_8_CHAR_INIT);
3206     movdl(vec1, ch);
3207     pshuflw(vec1, vec1, 0x00);
3208     pshufd(vec1, vec1, 0);
3209     pxor(vec2, vec2);
3210   }
3211   bind(SCAN_TO_8_CHAR);
3212   cmpl(cnt1, stride);
3213   jcc(Assembler::less, SCAN_TO_CHAR);
3214   if (UseAVX < 2) {
3215     movdl(vec1, ch);
3216     pshuflw(vec1, vec1, 0x00);
3217     pshufd(vec1, vec1, 0);
3218     pxor(vec2, vec2);
3219   }
3220   movl(tmp, cnt1);
3221   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3222   andl(cnt1,0x00000007);  //tail count (in chars)
3223 
3224   bind(SCAN_TO_8_CHAR_LOOP);
3225   movdqu(vec3, Address(result, 0));
3226   pcmpeqw(vec3, vec1);
3227   ptest(vec2, vec3);
3228   jcc(Assembler::carryClear, FOUND_CHAR);
3229   addptr(result, 16);
3230   subl(tmp, stride);
3231   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3232   bind(SCAN_TO_CHAR);
3233   testl(cnt1, cnt1);
3234   jcc(Assembler::zero, RET_NOT_FOUND);
3235   bind(SCAN_TO_CHAR_LOOP);
3236   load_unsigned_short(tmp, Address(result, 0));
3237   cmpl(ch, tmp);
3238   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3239   addptr(result, 2);
3240   subl(cnt1, 1);
3241   jccb(Assembler::zero, RET_NOT_FOUND);
3242   jmp(SCAN_TO_CHAR_LOOP);
3243 
3244   bind(RET_NOT_FOUND);
3245   movl(result, -1);
3246   jmpb(DONE_LABEL);
3247 
3248   bind(FOUND_CHAR);
3249   if (UseAVX >= 2) {
3250     vpmovmskb(tmp, vec3);
3251   } else {
3252     pmovmskb(tmp, vec3);
3253   }
3254   bsfl(ch, tmp);
3255   addptr(result, ch);
3256 
3257   bind(FOUND_SEQ_CHAR);
3258   subptr(result, str1);
3259   shrl(result, 1);
3260 
3261   bind(DONE_LABEL);
3262 } // string_indexof_char
3263 
3264 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3265                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3266   ShortBranchVerifier sbv(this);
3267   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
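
  // Latin-1 (byte[]) variant of the scan above; an illustrative Java-level
  // sketch of the semantics (reference only, not the exact library source;
  // the method name is hypothetical):
  //
  //   static int indexOfByte(byte[] str, int ch, int len) {
  //     for (int i = 0; i < len; i++) {
  //       if ((str[i] & 0xff) == ch) {
  //         return i;
  //       }
  //     }
  //     return -1;
  //   }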
3268 
3269   int stride = 16;
3270 
3271   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3272         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3273         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3274         FOUND_SEQ_CHAR, DONE_LABEL;
3275 
3276   movptr(result, str1);
3277   if (UseAVX >= 2) {
3278     cmpl(cnt1, stride);
3279     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3280     cmpl(cnt1, stride*2);
3281     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3282     movdl(vec1, ch);
3283     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3284     vpxor(vec2, vec2);
3285     movl(tmp, cnt1);
3286     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3287     andl(cnt1,0x0000001F);  //tail count (in chars)
3288 
3289     bind(SCAN_TO_32_CHAR_LOOP);
3290     vmovdqu(vec3, Address(result, 0));
3291     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3292     vptest(vec2, vec3);
3293     jcc(Assembler::carryClear, FOUND_CHAR);
3294     addptr(result, 32);
3295     subl(tmp, stride*2);
3296     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3297     jmp(SCAN_TO_16_CHAR);
3298 
3299     bind(SCAN_TO_16_CHAR_INIT);
3300     movdl(vec1, ch);
3301     pxor(vec2, vec2);
3302     pshufb(vec1, vec2);
3303   }
3304 
3305   bind(SCAN_TO_16_CHAR);
3306   cmpl(cnt1, stride);
3307   jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
3308   if (UseAVX < 2) {
3309     movdl(vec1, ch);
3310     pxor(vec2, vec2);
3311     pshufb(vec1, vec2);
3312   }
3313   movl(tmp, cnt1);
3314   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3315   andl(cnt1,0x0000000F);  //tail count (in bytes)
3316 
3317   bind(SCAN_TO_16_CHAR_LOOP);
3318   movdqu(vec3, Address(result, 0));
3319   pcmpeqb(vec3, vec1);
3320   ptest(vec2, vec3);
3321   jcc(Assembler::carryClear, FOUND_CHAR);
3322   addptr(result, 16);
3323   subl(tmp, stride);
3324   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3325 
3326   bind(SCAN_TO_CHAR_INIT);
3327   testl(cnt1, cnt1);
3328   jcc(Assembler::zero, RET_NOT_FOUND);
3329   bind(SCAN_TO_CHAR_LOOP);
3330   load_unsigned_byte(tmp, Address(result, 0));
3331   cmpl(ch, tmp);
3332   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3333   addptr(result, 1);
3334   subl(cnt1, 1);
3335   jccb(Assembler::zero, RET_NOT_FOUND);
3336   jmp(SCAN_TO_CHAR_LOOP);
3337 
3338   bind(RET_NOT_FOUND);
3339   movl(result, -1);
3340   jmpb(DONE_LABEL);
3341 
3342   bind(FOUND_CHAR);
3343   if (UseAVX >= 2) {
3344     vpmovmskb(tmp, vec3);
3345   } else {
3346     pmovmskb(tmp, vec3);
3347   }
3348   bsfl(ch, tmp);
3349   addptr(result, ch);
3350 
3351   bind(FOUND_SEQ_CHAR);
3352   subptr(result, str1);
3353 
3354   bind(DONE_LABEL);
3355 } // stringL_indexof_char
3356 
3357 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3358   switch (eltype) {
3359   case T_BOOLEAN: return sizeof(jboolean);
3360   case T_BYTE:  return sizeof(jbyte);
3361   case T_SHORT: return sizeof(jshort);
3362   case T_CHAR:  return sizeof(jchar);
3363   case T_INT:   return sizeof(jint);
3364   default:
3365     ShouldNotReachHere();
3366     return -1;
3367   }
3368 }
3369 
3370 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3371   switch (eltype) {
3372   // T_BOOLEAN used as surrogate for unsigned byte
3373   case T_BOOLEAN: movzbl(dst, src);   break;
3374   case T_BYTE:    movsbl(dst, src);   break;
3375   case T_SHORT:   movswl(dst, src);   break;
3376   case T_CHAR:    movzwl(dst, src);   break;
3377   case T_INT:     movl(dst, src);     break;
3378   default:
3379     ShouldNotReachHere();
3380   }
3381 }
3382 
3383 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3384   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3385 }
3386 
3387 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3388   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3389 }
3390 
3391 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3392   const int vlen = Assembler::AVX_256bit;
3393   switch (eltype) {
3394   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3395   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3396   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3397   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3398   case T_INT:
3399     // do nothing
3400     break;
3401   default:
3402     ShouldNotReachHere();
3403   }
3404 }
3405 
3406 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3407                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3408                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3409                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3410                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3411                                         BasicType eltype) {
3412   ShortBranchVerifier sbv(this);
3413   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3414   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3415   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3416 
3417   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3418         SHORT_UNROLLED_LOOP_EXIT,
3419         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3420         UNROLLED_VECTOR_LOOP_BEGIN,
3421         END;
3422   switch (eltype) {
3423   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3424   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3425   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3426   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3427   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3428   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3429   }
3430 
3431   // For "renaming" for readibility of the code
3432   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3433                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3434                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3435 
3436   const int elsize = arrays_hashcode_elsize(eltype);
3437 
3438   /*
3439     if (cnt1 >= 2) {
3440       if (cnt1 >= 32) {
3441         UNROLLED VECTOR LOOP
3442       }
3443       UNROLLED SCALAR LOOP
3444     }
3445     SINGLE SCALAR
3446    */
3447 
3448   cmpl(cnt1, 32);
3449   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3450 
3451   // cnt1 >= 32 && generate_vectorized_loop
3452   xorl(index, index);
3453 
3454   // vresult = IntVector.zero(I256);
3455   for (int idx = 0; idx < 4; idx++) {
3456     vpxor(vresult[idx], vresult[idx]);
3457   }
3458   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3459   Register bound = tmp2;
3460   Register next = tmp3;
3461   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3462   movl(next, Address(tmp2, 0));
3463   movdl(vnext, next);
3464   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3465 
3466   // index = 0;
3467   // bound = cnt1 & ~(32 - 1);
3468   movl(bound, cnt1);
3469   andl(bound, ~(32 - 1));
3470   // for (; index < bound; index += 32) {
3471   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3472   // result *= next;
3473   imull(result, next);
  // Loop fission to front-load the cost of fetching from memory; OOO execution
  // can then hopefully do a better job of prefetching.
3476   for (int idx = 0; idx < 4; idx++) {
3477     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3478   }
3479   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3480   for (int idx = 0; idx < 4; idx++) {
3481     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3482     arrays_hashcode_elvcast(vtmp[idx], eltype);
3483     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3484   }
3485   // index += 32;
3486   addl(index, 32);
3487   // index < bound;
3488   cmpl(index, bound);
3489   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3490   // }
3491 
3492   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3493   subl(cnt1, bound);
3494   // release bound
3495 
3496   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3497   for (int idx = 0; idx < 4; idx++) {
3498     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3499     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3500     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3501   }
3502   // result += vresult.reduceLanes(ADD);
3503   for (int idx = 0; idx < 4; idx++) {
3504     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3505   }
3506 
3507   // } else if (cnt1 < 32) {
3508 
3509   bind(SHORT_UNROLLED_BEGIN);
3510   // int i = 1;
3511   movl(index, 1);
3512   cmpl(index, cnt1);
3513   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3514 
3515   // for (; i < cnt1 ; i += 2) {
3516   bind(SHORT_UNROLLED_LOOP_BEGIN);
3517   movl(tmp3, 961);
3518   imull(result, tmp3);
3519   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3520   movl(tmp3, tmp2);
3521   shll(tmp3, 5);
3522   subl(tmp3, tmp2);
3523   addl(result, tmp3);
3524   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3525   addl(result, tmp3);
3526   addl(index, 2);
3527   cmpl(index, cnt1);
3528   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3529 
3530   // }
3531   // if (i >= cnt1) {
3532   bind(SHORT_UNROLLED_LOOP_EXIT);
3533   jccb(Assembler::greater, END);
3534   movl(tmp2, result);
3535   shll(result, 5);
3536   subl(result, tmp2);
3537   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3538   addl(result, tmp3);
3539   // }
3540   bind(END);
3541 
3542   BLOCK_COMMENT("} // arrays_hashcode");
3543 
3544 } // arrays_hashcode
3545 
3546 // helper function for string_compare
3547 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3548                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3549                                            Address::ScaleFactor scale2, Register index, int ae) {
3550   if (ae == StrIntrinsicNode::LL) {
3551     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3552     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3553   } else if (ae == StrIntrinsicNode::UU) {
3554     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3555     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3556   } else {
3557     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3558     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3559   }
3560 }
3561 
3562 // Compare strings, used for char[] and byte[].
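//
// An illustrative Java-level sketch of the comparison (semantics of
// String.compareTo; reference only, not the exact library source):
//
//   static int compare(char[] s1, int len1, char[] s2, int len2) {
//     int min = Math.min(len1, len2);
//     for (int i = 0; i < min; i++) {
//       if (s1[i] != s2[i]) {
//         return s1[i] - s2[i];
//       }
//     }
//     return len1 - len2;
//   }
//
// For the UL case the result is negated at DONE_LABEL, since the operands
// are presumably supplied in swapped order for that encoding.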
3563 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3564                                        Register cnt1, Register cnt2, Register result,
3565                                        XMMRegister vec1, int ae, KRegister mask) {
3566   ShortBranchVerifier sbv(this);
3567   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
  Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only with AVX3
3569   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3570   int stride2x2 = 0x40;
3571   Address::ScaleFactor scale = Address::no_scale;
3572   Address::ScaleFactor scale1 = Address::no_scale;
3573   Address::ScaleFactor scale2 = Address::no_scale;
3574 
3575   if (ae != StrIntrinsicNode::LL) {
3576     stride2x2 = 0x20;
3577   }
3578 
3579   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3580     shrl(cnt2, 1);
3581   }
  // Compute the minimum of the string lengths and push the
  // difference of the string lengths onto the stack.
  // The minimum is selected with a conditional move.
3585   movl(result, cnt1);
3586   subl(cnt1, cnt2);
3587   push(cnt1);
3588   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3589 
3590   // Is the minimum length zero?
3591   testl(cnt2, cnt2);
3592   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3593   if (ae == StrIntrinsicNode::LL) {
3594     // Load first bytes
3595     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3596     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3597   } else if (ae == StrIntrinsicNode::UU) {
3598     // Load first characters
3599     load_unsigned_short(result, Address(str1, 0));
3600     load_unsigned_short(cnt1, Address(str2, 0));
3601   } else {
3602     load_unsigned_byte(result, Address(str1, 0));
3603     load_unsigned_short(cnt1, Address(str2, 0));
3604   }
3605   subl(result, cnt1);
3606   jcc(Assembler::notZero,  POP_LABEL);
3607 
3608   if (ae == StrIntrinsicNode::UU) {
3609     // Divide length by 2 to get number of chars
3610     shrl(cnt2, 1);
3611   }
3612   cmpl(cnt2, 1);
3613   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3614 
3615   // Check if the strings start at the same location and setup scale and stride
3616   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3617     cmpptr(str1, str2);
3618     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3619     if (ae == StrIntrinsicNode::LL) {
3620       scale = Address::times_1;
3621       stride = 16;
3622     } else {
3623       scale = Address::times_2;
3624       stride = 8;
3625     }
3626   } else {
3627     scale1 = Address::times_1;
3628     scale2 = Address::times_2;
3629     // scale not used
3630     stride = 8;
3631   }
3632 
3633   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3634     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3635     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3636     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3637     Label COMPARE_TAIL_LONG;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only with AVX3
3639 
3640     int pcmpmask = 0x19;
3641     if (ae == StrIntrinsicNode::LL) {
3642       pcmpmask &= ~0x01;
3643     }
3644 
    // Setup to compare 16-char (32-byte) vectors,
    // starting from the first character again because it has an aligned address.
3647     if (ae == StrIntrinsicNode::LL) {
3648       stride2 = 32;
3649     } else {
3650       stride2 = 16;
3651     }
3652     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3653       adr_stride = stride << scale;
3654     } else {
3655       adr_stride1 = 8;  //stride << scale1;
3656       adr_stride2 = 16; //stride << scale2;
3657     }
3658 
3659     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3660     // rax and rdx are used by pcmpestri as elements counters
3661     movl(result, cnt2);
3662     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3663     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3664 
    // Fast path: compare the first two 8-char vectors.
3666     bind(COMPARE_16_CHARS);
3667     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3668       movdqu(vec1, Address(str1, 0));
3669     } else {
3670       pmovzxbw(vec1, Address(str1, 0));
3671     }
3672     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3673     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3674 
3675     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3676       movdqu(vec1, Address(str1, adr_stride));
3677       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3678     } else {
3679       pmovzxbw(vec1, Address(str1, adr_stride1));
3680       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3681     }
3682     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3683     addl(cnt1, stride);
3684 
3685     // Compare the characters at index in cnt1
3686     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3687     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3688     subl(result, cnt2);
3689     jmp(POP_LABEL);
3690 
3691     // Setup the registers to start vector comparison loop
3692     bind(COMPARE_WIDE_VECTORS);
3693     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3694       lea(str1, Address(str1, result, scale));
3695       lea(str2, Address(str2, result, scale));
3696     } else {
3697       lea(str1, Address(str1, result, scale1));
3698       lea(str2, Address(str2, result, scale2));
3699     }
3700     subl(result, stride2);
3701     subl(cnt2, stride2);
3702     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3703     negptr(result);
3704 
    // In a loop, compare 16 chars (32 bytes) at once using (vpxor+vptest).
3706     bind(COMPARE_WIDE_VECTORS_LOOP);
3707 
3708     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3709       cmpl(cnt2, stride2x2);
3710       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3711       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3712       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3713 
3714       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3715       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3716         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3717         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3718       } else {
3719         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3720         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3721       }
3722       kortestql(mask, mask);
3723       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3724       addptr(result, stride2x2);  // update since we already compared at this addr
3725       subl(cnt2, stride2x2);      // and sub the size too
3726       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3727 
3728       vpxor(vec1, vec1);
3729       jmpb(COMPARE_WIDE_TAIL);
3730     }//if (VM_Version::supports_avx512vlbw())
3731 
3732     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3733     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3734       vmovdqu(vec1, Address(str1, result, scale));
3735       vpxor(vec1, Address(str2, result, scale));
3736     } else {
3737       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3738       vpxor(vec1, Address(str2, result, scale2));
3739     }
3740     vptest(vec1, vec1);
3741     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3742     addptr(result, stride2);
3743     subl(cnt2, stride2);
3744     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3745     // clean upper bits of YMM registers
3746     vpxor(vec1, vec1);
3747 
3748     // compare wide vectors tail
3749     bind(COMPARE_WIDE_TAIL);
3750     testptr(result, result);
3751     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3752 
3753     movl(result, stride2);
3754     movl(cnt2, result);
3755     negptr(result);
3756     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3757 
    // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3759     bind(VECTOR_NOT_EQUAL);
3760     // clean upper bits of YMM registers
3761     vpxor(vec1, vec1);
3762     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3763       lea(str1, Address(str1, result, scale));
3764       lea(str2, Address(str2, result, scale));
3765     } else {
3766       lea(str1, Address(str1, result, scale1));
3767       lea(str2, Address(str2, result, scale2));
3768     }
3769     jmp(COMPARE_16_CHARS);
3770 
    // Compare tail chars, length between 1 and 15 chars
3772     bind(COMPARE_TAIL_LONG);
3773     movl(cnt2, result);
3774     cmpl(cnt2, stride);
3775     jcc(Assembler::less, COMPARE_SMALL_STR);
3776 
3777     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3778       movdqu(vec1, Address(str1, 0));
3779     } else {
3780       pmovzxbw(vec1, Address(str1, 0));
3781     }
3782     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3783     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3784     subptr(cnt2, stride);
3785     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3786     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3787       lea(str1, Address(str1, result, scale));
3788       lea(str2, Address(str2, result, scale));
3789     } else {
3790       lea(str1, Address(str1, result, scale1));
3791       lea(str2, Address(str2, result, scale2));
3792     }
3793     negptr(cnt2);
3794     jmpb(WHILE_HEAD_LABEL);
3795 
3796     bind(COMPARE_SMALL_STR);
3797   } else if (UseSSE42Intrinsics) {
3798     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3799     int pcmpmask = 0x19;
    // Setup to compare 8-char (16-byte) vectors,
    // starting from the first character again because it has an aligned address.
3802     movl(result, cnt2);
3803     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3804     if (ae == StrIntrinsicNode::LL) {
3805       pcmpmask &= ~0x01;
3806     }
3807     jcc(Assembler::zero, COMPARE_TAIL);
3808     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3809       lea(str1, Address(str1, result, scale));
3810       lea(str2, Address(str2, result, scale));
3811     } else {
3812       lea(str1, Address(str1, result, scale1));
3813       lea(str2, Address(str2, result, scale2));
3814     }
3815     negptr(result);
3816 
3817     // pcmpestri
3818     //   inputs:
    //     vec1 - substring
3820     //     rax - negative string length (elements count)
3821     //     mem - scanned string
3822     //     rdx - string length (elements count)
3823     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3824     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3825     //   outputs:
3826     //     rcx - first mismatched element index
3827     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3828 
3829     bind(COMPARE_WIDE_VECTORS);
3830     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3831       movdqu(vec1, Address(str1, result, scale));
3832       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3833     } else {
3834       pmovzxbw(vec1, Address(str1, result, scale1));
3835       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3836     }
3837     // After pcmpestri cnt1(rcx) contains mismatched element index
3838 
3839     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3840     addptr(result, stride);
3841     subptr(cnt2, stride);
3842     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3843 
3844     // compare wide vectors tail
3845     testptr(result, result);
3846     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3847 
3848     movl(cnt2, stride);
3849     movl(result, stride);
3850     negptr(result);
3851     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3852       movdqu(vec1, Address(str1, result, scale));
3853       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3854     } else {
3855       pmovzxbw(vec1, Address(str1, result, scale1));
3856       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3857     }
3858     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3859 
3860     // Mismatched characters in the vectors
3861     bind(VECTOR_NOT_EQUAL);
3862     addptr(cnt1, result);
3863     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3864     subl(result, cnt2);
3865     jmpb(POP_LABEL);
3866 
3867     bind(COMPARE_TAIL); // limit is zero
3868     movl(cnt2, result);
3869     // Fallthru to tail compare
3870   }
3871   // Shift str2 and str1 to the end of the arrays, negate min
3872   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3873     lea(str1, Address(str1, cnt2, scale));
3874     lea(str2, Address(str2, cnt2, scale));
3875   } else {
3876     lea(str1, Address(str1, cnt2, scale1));
3877     lea(str2, Address(str2, cnt2, scale2));
3878   }
3879   decrementl(cnt2);  // first character was compared already
3880   negptr(cnt2);
3881 
3882   // Compare the rest of the elements
3883   bind(WHILE_HEAD_LABEL);
3884   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3885   subl(result, cnt1);
3886   jccb(Assembler::notZero, POP_LABEL);
3887   increment(cnt2);
3888   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3889 
3890   // Strings are equal up to min length.  Return the length difference.
3891   bind(LENGTH_DIFF_LABEL);
3892   pop(result);
3893   if (ae == StrIntrinsicNode::UU) {
3894     // Divide diff by 2 to get number of chars
3895     sarl(result, 1);
3896   }
3897   jmpb(DONE_LABEL);
3898 
3899   if (VM_Version::supports_avx512vlbw()) {
3900 
3901     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3902 
3903     kmovql(cnt1, mask);
3904     notq(cnt1);
3905     bsfq(cnt2, cnt1);
3906     if (ae != StrIntrinsicNode::LL) {
3907       // Divide diff by 2 to get number of chars
3908       sarl(cnt2, 1);
3909     }
3910     addq(result, cnt2);
3911     if (ae == StrIntrinsicNode::LL) {
3912       load_unsigned_byte(cnt1, Address(str2, result));
3913       load_unsigned_byte(result, Address(str1, result));
3914     } else if (ae == StrIntrinsicNode::UU) {
3915       load_unsigned_short(cnt1, Address(str2, result, scale));
3916       load_unsigned_short(result, Address(str1, result, scale));
3917     } else {
3918       load_unsigned_short(cnt1, Address(str2, result, scale2));
3919       load_unsigned_byte(result, Address(str1, result, scale1));
3920     }
3921     subl(result, cnt1);
3922     jmpb(POP_LABEL);
3923   }//if (VM_Version::supports_avx512vlbw())
3924 
3925   // Discard the stored length difference
3926   bind(POP_LABEL);
3927   pop(cnt1);
3928 
3929   // That's it
3930   bind(DONE_LABEL);
  if (ae == StrIntrinsicNode::UL) {
3932     negl(result);
3933   }
3934 
3935 }
3936 
// Search for a non-ASCII character (negative byte value) in a byte array;
// return the index of the first such character, otherwise the length
// of the array segment searched.
3940 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3941 //   @IntrinsicCandidate
3942 //   public static int countPositives(byte[] ba, int off, int len) {
3943 //     for (int i = off; i < off + len; i++) {
3944 //       if (ba[i] < 0) {
3945 //         return i - off;
3946 //       }
3947 //     }
3948 //     return len;
3949 //   }
3950 void C2_MacroAssembler::count_positives(Register ary1, Register len,
3951   Register result, Register tmp1,
3952   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3953   // rsi: byte array
3954   // rcx: len
3955   // rax: result
3956   ShortBranchVerifier sbv(this);
3957   assert_different_registers(ary1, len, result, tmp1);
3958   assert_different_registers(vec1, vec2);
3959   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3960 
3961   movl(result, len); // copy
3962   // len == 0
3963   testl(len, len);
3964   jcc(Assembler::zero, DONE);
3965 
3966   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3967     VM_Version::supports_avx512vlbw() &&
3968     VM_Version::supports_bmi2()) {
3969 
3970     Label test_64_loop, test_tail, BREAK_LOOP;
3971     movl(tmp1, len);
3972     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3973 
3974     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
3975     andl(len,  0xffffffc0); // vector count (in chars)
3976     jccb(Assembler::zero, test_tail);
3977 
3978     lea(ary1, Address(ary1, len, Address::times_1));
3979     negptr(len);
3980 
3981     bind(test_64_loop);
3982     // Check whether our 64 elements of size byte contain negatives
3983     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3984     kortestql(mask1, mask1);
3985     jcc(Assembler::notZero, BREAK_LOOP);
3986 
3987     addptr(len, 64);
3988     jccb(Assembler::notZero, test_64_loop);
3989 
3990     bind(test_tail);
3991     // bail out when there is nothing to be done
3992     testl(tmp1, -1);
3993     jcc(Assembler::zero, DONE);
3994 
3995 
    // check the tail for absence of negatives
3997     // ~(~0 << len) applied up to two times (for 32-bit scenario)
3998     {
3999       Register tmp3_aliased = len;
4000       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4001       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4002       notq(tmp3_aliased);
4003       kmovql(mask2, tmp3_aliased);
4004     }
4005 
4006     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4007     ktestq(mask1, mask2);
4008     jcc(Assembler::zero, DONE);
4009 
    // do a full check for negative bytes in the tail
4011     movl(len, tmp1); // tmp1 holds low 6-bit from original len;
4012                      // ary1 already pointing to the right place
4013     jmpb(TAIL_START);
4014 
4015     bind(BREAK_LOOP);
4016     // At least one byte in the last 64 byte block was negative.
4017     // Set up to look at the last 64 bytes as if they were a tail
4018     lea(ary1, Address(ary1, len, Address::times_1));
4019     addptr(result, len);
4020     // Ignore the very last byte: if all others are positive,
4021     // it must be negative, so we can skip right to the 2+1 byte
4022     // end comparison at this point
4023     orl(result, 63);
4024     movl(len, 63);
4025     // Fallthru to tail compare
4026   } else {
4027 
4028     if (UseAVX >= 2) {
4029       // With AVX2, use 32-byte vector compare
4030       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4031 
4032       // Compare 32-byte vectors
4033       testl(len, 0xffffffe0);   // vector count (in bytes)
4034       jccb(Assembler::zero, TAIL_START);
4035 
4036       andl(len, 0xffffffe0);
4037       lea(ary1, Address(ary1, len, Address::times_1));
4038       negptr(len);
4039 
4040       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
4041       movdl(vec2, tmp1);
4042       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4043 
4044       bind(COMPARE_WIDE_VECTORS);
4045       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4046       vptest(vec1, vec2);
4047       jccb(Assembler::notZero, BREAK_LOOP);
4048       addptr(len, 32);
4049       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4050 
4051       testl(result, 0x0000001f);   // any bytes remaining?
4052       jcc(Assembler::zero, DONE);
4053 
4054       // Quick test using the already prepared vector mask
4055       movl(len, result);
4056       andl(len, 0x0000001f);
4057       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4058       vptest(vec1, vec2);
4059       jcc(Assembler::zero, DONE);
      // There are negative bytes; jump to the tail to determine exactly where
4061       jmpb(TAIL_START);
4062 
4063       bind(BREAK_LOOP);
4064       // At least one byte in the last 32-byte vector is negative.
4065       // Set up to look at the last 32 bytes as if they were a tail
4066       lea(ary1, Address(ary1, len, Address::times_1));
4067       addptr(result, len);
4068       // Ignore the very last byte: if all others are positive,
4069       // it must be negative, so we can skip right to the 2+1 byte
4070       // end comparison at this point
4071       orl(result, 31);
4072       movl(len, 31);
4073       // Fallthru to tail compare
4074     } else if (UseSSE42Intrinsics) {
4075       // With SSE4.2, use double quad vector compare
4076       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4077 
4078       // Compare 16-byte vectors
4079       testl(len, 0xfffffff0);   // vector count (in bytes)
4080       jcc(Assembler::zero, TAIL_START);
4081 
4082       andl(len, 0xfffffff0);
4083       lea(ary1, Address(ary1, len, Address::times_1));
4084       negptr(len);
4085 
4086       movl(tmp1, 0x80808080);
4087       movdl(vec2, tmp1);
4088       pshufd(vec2, vec2, 0);
4089 
4090       bind(COMPARE_WIDE_VECTORS);
4091       movdqu(vec1, Address(ary1, len, Address::times_1));
4092       ptest(vec1, vec2);
4093       jccb(Assembler::notZero, BREAK_LOOP);
4094       addptr(len, 16);
4095       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4096 
4097       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4098       jcc(Assembler::zero, DONE);
4099 
4100       // Quick test using the already prepared vector mask
4101       movl(len, result);
4102       andl(len, 0x0000000f);   // tail count (in bytes)
4103       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4104       ptest(vec1, vec2);
4105       jcc(Assembler::zero, DONE);
4106       jmpb(TAIL_START);
4107 
4108       bind(BREAK_LOOP);
4109       // At least one byte in the last 16-byte vector is negative.
4110       // Set up and look at the last 16 bytes as if they were a tail
4111       lea(ary1, Address(ary1, len, Address::times_1));
4112       addptr(result, len);
4113       // Ignore the very last byte: if all others are positive,
4114       // it must be negative, so we can skip right to the 2+1 byte
4115       // end comparison at this point
4116       orl(result, 15);
4117       movl(len, 15);
4118       // Fallthru to tail compare
4119     }
4120   }
4121 
4122   bind(TAIL_START);
4123   // Compare 4-byte vectors
4124   andl(len, 0xfffffffc); // vector count (in bytes)
4125   jccb(Assembler::zero, COMPARE_CHAR);
4126 
4127   lea(ary1, Address(ary1, len, Address::times_1));
4128   negptr(len);
4129 
4130   bind(COMPARE_VECTORS);
4131   movl(tmp1, Address(ary1, len, Address::times_1));
4132   andl(tmp1, 0x80808080);
4133   jccb(Assembler::notZero, TAIL_ADJUST);
4134   addptr(len, 4);
4135   jccb(Assembler::notZero, COMPARE_VECTORS);
4136 
4137   // Compare trailing char (final 2-3 bytes), if any
4138   bind(COMPARE_CHAR);
4139 
4140   testl(result, 0x2);   // tail  char
4141   jccb(Assembler::zero, COMPARE_BYTE);
4142   load_unsigned_short(tmp1, Address(ary1, 0));
4143   andl(tmp1, 0x00008080);
4144   jccb(Assembler::notZero, CHAR_ADJUST);
4145   lea(ary1, Address(ary1, 2));
4146 
4147   bind(COMPARE_BYTE);
4148   testl(result, 0x1);   // tail  byte
4149   jccb(Assembler::zero, DONE);
4150   load_unsigned_byte(tmp1, Address(ary1, 0));
4151   testl(tmp1, 0x00000080);
4152   jccb(Assembler::zero, DONE);
4153   subptr(result, 1);
4154   jmpb(DONE);
4155 
4156   bind(TAIL_ADJUST);
  // There are negative bytes in the last 4-byte block.
  // Adjust result and check the next three bytes.
4159   addptr(result, len);
4160   orl(result, 3);
4161   lea(ary1, Address(ary1, len, Address::times_1));
4162   jmpb(COMPARE_CHAR);
4163 
4164   bind(CHAR_ADJUST);
4165   // We are looking at a char + optional byte tail, and found that one
4166   // of the bytes in the char is negative. Adjust the result, check the
4167   // first byte and readjust if needed.
4168   andl(result, 0xfffffffc);
4169   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4170   jccb(Assembler::notZero, DONE);
4171   addptr(result, 1);
4172 
4173   // That's it
4174   bind(DONE);
4175   if (UseAVX >= 2) {
4176     // clean upper bits of YMM registers
4177     vpxor(vec1, vec1);
4178     vpxor(vec2, vec2);
4179   }
4180 }
4181 
4182 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
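//
// An illustrative Java-level sketch of the array-equality case (semantics of
// Arrays.equals; reference only, not the exact library source):
//
//   static boolean equals(byte[] a, byte[] b) {
//     if (a == b) return true;
//     if (a == null || b == null || a.length != b.length) return false;
//     for (int i = 0; i < a.length; i++) {
//       if (a[i] != b[i]) return false;
//     }
//     return true;
//   }
//
// When is_array_equ is false, only the element loop applies and the caller
// supplies the limit directly (substring comparison).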
4183 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4184                                       Register limit, Register result, Register chr,
4185                                       XMMRegister vec1, XMMRegister vec2, bool is_char,
4186                                       KRegister mask, bool expand_ary2) {
4187   // for expand_ary2, limit is the (smaller) size of the second array.
4188   ShortBranchVerifier sbv(this);
4189   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4190 
4191   assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4192          "Expansion only implemented for AVX2");
4193 
4194   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4195   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4196 
4197   Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4198   int scaleIncr = expand_ary2 ? 8 : 16;
4199 
4200   if (is_array_equ) {
4201     // Check the input args
4202     cmpoop(ary1, ary2);
4203     jcc(Assembler::equal, TRUE_LABEL);
4204 
4205     // Need additional checks for arrays_equals.
4206     testptr(ary1, ary1);
4207     jcc(Assembler::zero, FALSE_LABEL);
4208     testptr(ary2, ary2);
4209     jcc(Assembler::zero, FALSE_LABEL);
4210 
4211     // Check the lengths
4212     movl(limit, Address(ary1, length_offset));
4213     cmpl(limit, Address(ary2, length_offset));
4214     jcc(Assembler::notEqual, FALSE_LABEL);
4215   }
4216 
4217   // count == 0
4218   testl(limit, limit);
4219   jcc(Assembler::zero, TRUE_LABEL);
4220 
4221   if (is_array_equ) {
4222     // Load array address
4223     lea(ary1, Address(ary1, base_offset));
4224     lea(ary2, Address(ary2, base_offset));
4225   }
4226 
4227   if (is_array_equ && is_char) {
4228     // arrays_equals when used for char[].
4229     shll(limit, 1);      // byte count != 0
4230   }
4231   movl(result, limit); // copy
4232 
4233   if (UseAVX >= 2) {
4234     // With AVX2, use 32-byte vector compare
4235     Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4236 
4237     // Compare 32-byte vectors
4238     if (expand_ary2) {
4239       andl(result, 0x0000000f);  //   tail count (in bytes)
4240       andl(limit, 0xfffffff0);   // vector count (in bytes)
4241       jcc(Assembler::zero, COMPARE_TAIL);
4242     } else {
4243       andl(result, 0x0000001f);  //   tail count (in bytes)
4244       andl(limit, 0xffffffe0);   // vector count (in bytes)
4245       jcc(Assembler::zero, COMPARE_TAIL_16);
4246     }
4247 
4248     lea(ary1, Address(ary1, limit, scaleFactor));
4249     lea(ary2, Address(ary2, limit, Address::times_1));
4250     negptr(limit);
4251 
4252     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4253       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4254 
4255       cmpl(limit, -64);
4256       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4257 
4258       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4259 
4260       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4261       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4262       kortestql(mask, mask);
4263       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4264       addptr(limit, 64);  // update since we already compared at this addr
4265       cmpl(limit, -64);
4266       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4267 
4268       // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the non-wide path:
4270       //  cmpl(limit, 0);
4271       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4272       // But since we stopped at the points ary{1,2}+limit which are
4273       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4274       // (|limit| <= 32 and result < 32),
4275       // we may just compare the last 64 bytes.
4276       //
      addptr(result, -64);   // it is safe, because we just came from this area
4278       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4279       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4280       kortestql(mask, mask);
4281       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4282 
4283       jmp(TRUE_LABEL);
4284 
4285       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4286 
4287     }//if (VM_Version::supports_avx512vlbw())
4288 
4289     bind(COMPARE_WIDE_VECTORS);
4290     vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4291     if (expand_ary2) {
4292       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4293     } else {
4294       vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4295     }
4296     vpxor(vec1, vec2);
4297 
4298     vptest(vec1, vec1);
4299     jcc(Assembler::notZero, FALSE_LABEL);
4300     addptr(limit, scaleIncr * 2);
4301     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4302 
4303     testl(result, result);
4304     jcc(Assembler::zero, TRUE_LABEL);
4305 
4306     vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4307     if (expand_ary2) {
4308       vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4309     } else {
4310       vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4311     }
4312     vpxor(vec1, vec2);
4313 
4314     vptest(vec1, vec1);
4315     jcc(Assembler::notZero, FALSE_LABEL);
4316     jmp(TRUE_LABEL);
4317 
4318     bind(COMPARE_TAIL_16); // limit is zero
4319     movl(limit, result);
4320 
4321     // Compare 16-byte chunks
4322     andl(result, 0x0000000f);  //   tail count (in bytes)
4323     andl(limit, 0xfffffff0);   // vector count (in bytes)
4324     jcc(Assembler::zero, COMPARE_TAIL);
4325 
4326     lea(ary1, Address(ary1, limit, scaleFactor));
4327     lea(ary2, Address(ary2, limit, Address::times_1));
4328     negptr(limit);
4329 
4330     bind(COMPARE_WIDE_VECTORS_16);
4331     movdqu(vec1, Address(ary1, limit, scaleFactor));
4332     if (expand_ary2) {
4333       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4334     } else {
4335       movdqu(vec2, Address(ary2, limit, Address::times_1));
4336     }
4337     pxor(vec1, vec2);
4338 
4339     ptest(vec1, vec1);
4340     jcc(Assembler::notZero, FALSE_LABEL);
4341     addptr(limit, scaleIncr);
4342     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4343 
4344     bind(COMPARE_TAIL); // limit is zero
4345     movl(limit, result);
4346     // Fallthru to tail compare
4347   } else if (UseSSE42Intrinsics) {
4348     // With SSE4.2, use double quad vector compare
4349     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4350 
4351     // Compare 16-byte vectors
4352     andl(result, 0x0000000f);  //   tail count (in bytes)
4353     andl(limit, 0xfffffff0);   // vector count (in bytes)
4354     jcc(Assembler::zero, COMPARE_TAIL);
4355 
4356     lea(ary1, Address(ary1, limit, Address::times_1));
4357     lea(ary2, Address(ary2, limit, Address::times_1));
4358     negptr(limit);
4359 
4360     bind(COMPARE_WIDE_VECTORS);
4361     movdqu(vec1, Address(ary1, limit, Address::times_1));
4362     movdqu(vec2, Address(ary2, limit, Address::times_1));
4363     pxor(vec1, vec2);
4364 
4365     ptest(vec1, vec1);
4366     jcc(Assembler::notZero, FALSE_LABEL);
4367     addptr(limit, 16);
4368     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4369 
4370     testl(result, result);
4371     jcc(Assembler::zero, TRUE_LABEL);
4372 
4373     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4374     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4375     pxor(vec1, vec2);
4376 
4377     ptest(vec1, vec1);
4378     jccb(Assembler::notZero, FALSE_LABEL);
4379     jmpb(TRUE_LABEL);
4380 
4381     bind(COMPARE_TAIL); // limit is zero
4382     movl(limit, result);
4383     // Fallthru to tail compare
4384   }
4385 
4386   // Compare 4-byte vectors
4387   if (expand_ary2) {
4388     testl(result, result);
4389     jccb(Assembler::zero, TRUE_LABEL);
4390   } else {
4391     andl(limit, 0xfffffffc); // vector count (in bytes)
4392     jccb(Assembler::zero, COMPARE_CHAR);
4393   }
4394 
4395   lea(ary1, Address(ary1, limit, scaleFactor));
4396   lea(ary2, Address(ary2, limit, Address::times_1));
4397   negptr(limit);
4398 
4399   bind(COMPARE_VECTORS);
4400   if (expand_ary2) {
4401     // There are no "vector" operations for bytes to shorts
4402     movzbl(chr, Address(ary2, limit, Address::times_1));
4403     cmpw(Address(ary1, limit, Address::times_2), chr);
4404     jccb(Assembler::notEqual, FALSE_LABEL);
4405     addptr(limit, 1);
4406     jcc(Assembler::notZero, COMPARE_VECTORS);
4407     jmp(TRUE_LABEL);
4408   } else {
4409     movl(chr, Address(ary1, limit, Address::times_1));
4410     cmpl(chr, Address(ary2, limit, Address::times_1));
4411     jccb(Assembler::notEqual, FALSE_LABEL);
4412     addptr(limit, 4);
4413     jcc(Assembler::notZero, COMPARE_VECTORS);
4414   }
4415 
4416   // Compare trailing char (final 2 bytes), if any
4417   bind(COMPARE_CHAR);
4418   testl(result, 0x2);   // tail  char
4419   jccb(Assembler::zero, COMPARE_BYTE);
4420   load_unsigned_short(chr, Address(ary1, 0));
4421   load_unsigned_short(limit, Address(ary2, 0));
4422   cmpl(chr, limit);
4423   jccb(Assembler::notEqual, FALSE_LABEL);
4424 
4425   if (is_array_equ && is_char) {
4426     bind(COMPARE_BYTE);
4427   } else {
4428     lea(ary1, Address(ary1, 2));
4429     lea(ary2, Address(ary2, 2));
4430 
4431     bind(COMPARE_BYTE);
4432     testl(result, 0x1);   // tail  byte
4433     jccb(Assembler::zero, TRUE_LABEL);
4434     load_unsigned_byte(chr, Address(ary1, 0));
4435     load_unsigned_byte(limit, Address(ary2, 0));
4436     cmpl(chr, limit);
4437     jccb(Assembler::notEqual, FALSE_LABEL);
4438   }
4439   bind(TRUE_LABEL);
4440   movl(result, 1);   // return true
4441   jmpb(DONE);
4442 
4443   bind(FALSE_LABEL);
4444   xorl(result, result); // return false
4445 
4446   // That's it
4447   bind(DONE);
4448   if (UseAVX >= 2) {
4449     // clean upper bits of YMM registers
4450     vpxor(vec1, vec1);
4451     vpxor(vec2, vec2);
4452   }
4453 }
4454 
4455 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4456 #define __ masm.
4457   Register dst = stub.data<0>();
4458   XMMRegister src = stub.data<1>();
4459   address target = stub.data<2>();
4460   __ bind(stub.entry());
4461   __ subptr(rsp, 8);
4462   __ movdbl(Address(rsp), src);
4463   __ call(RuntimeAddress(target));
4464   // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
4465   __ pop(dst);
4466   __ jmp(stub.continuation());
4467 #undef __
4468 }
4469 
4470 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4471   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4472   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
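  // cvttss2si/cvttsd2si write the "integer indefinite" value (0x80000000 for
  // 32-bit results, 0x8000000000000000 for 64-bit results) when the source is
  // NaN or out of range. That sentinel is detected below and routed to a fixup
  // stub which produces the Java-mandated result (0 for NaN, MIN/MAX otherwise).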
4473 
4474   address slowpath_target;
4475   if (dst_bt == T_INT) {
4476     if (src_bt == T_FLOAT) {
4477       cvttss2sil(dst, src);
4478       cmpl(dst, 0x80000000);
4479       slowpath_target = StubRoutines::x86::f2i_fixup();
4480     } else {
4481       cvttsd2sil(dst, src);
4482       cmpl(dst, 0x80000000);
4483       slowpath_target = StubRoutines::x86::d2i_fixup();
4484     }
4485   } else {
4486     if (src_bt == T_FLOAT) {
4487       cvttss2siq(dst, src);
4488       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4489       slowpath_target = StubRoutines::x86::f2l_fixup();
4490     } else {
4491       cvttsd2siq(dst, src);
4492       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4493       slowpath_target = StubRoutines::x86::d2l_fixup();
4494     }
4495   }
4496 
4497   // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
4498   int max_size = 23 + (UseAPX ? 1 : 0);
4499   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
4500   jcc(Assembler::equal, stub->entry());
4501   bind(stub->continuation());
4502 }
4503 
4504 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4505                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4506   switch(ideal_opc) {
4507     case Op_LShiftVS:
4508       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4509     case Op_LShiftVI:
4510       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4511     case Op_LShiftVL:
4512       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4513     case Op_RShiftVS:
4514       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4515     case Op_RShiftVI:
4516       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4517     case Op_RShiftVL:
4518       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4519     case Op_URShiftVS:
4520       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4521     case Op_URShiftVI:
4522       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4523     case Op_URShiftVL:
4524       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4525     case Op_RotateRightV:
4526       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4527     case Op_RotateLeftV:
4528       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4529     default:
4530       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4531       break;
4532   }
4533 }
4534 
4535 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4536                                                XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4537   if (is_unsigned) {
4538     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4539   } else {
4540     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4541   }
4542 }
4543 
4544 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4545                                                       XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4546   switch (elem_bt) {
4547     case T_BYTE:
4548       if (ideal_opc == Op_SaturatingAddV) {
4549         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4550       } else {
4551         assert(ideal_opc == Op_SaturatingSubV, "");
4552         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4553       }
4554       break;
4555     case T_SHORT:
4556       if (ideal_opc == Op_SaturatingAddV) {
4557         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4558       } else {
4559         assert(ideal_opc == Op_SaturatingSubV, "");
4560         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4561       }
4562       break;
4563     default:
4564       fatal("Unsupported type %s", type2name(elem_bt));
4565       break;
4566   }
4567 }
4568 
4569 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4570                                                         XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4571   switch (elem_bt) {
4572     case T_BYTE:
4573       if (ideal_opc == Op_SaturatingAddV) {
4574         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4575       } else {
4576         assert(ideal_opc == Op_SaturatingSubV, "");
4577         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4578       }
4579       break;
4580     case T_SHORT:
4581       if (ideal_opc == Op_SaturatingAddV) {
4582         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4583       } else {
4584         assert(ideal_opc == Op_SaturatingSubV, "");
4585         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4586       }
4587       break;
4588     default:
4589       fatal("Unsupported type %s", type2name(elem_bt));
4590       break;
4591   }
4592 }
4593 
4594 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4595                                                Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4596   if (is_unsigned) {
4597     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4598   } else {
4599     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4600   }
4601 }
4602 
4603 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4604                                                       XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4605   switch (elem_bt) {
4606     case T_BYTE:
4607       if (ideal_opc == Op_SaturatingAddV) {
4608         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4609       } else {
4610         assert(ideal_opc == Op_SaturatingSubV, "");
4611         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4612       }
4613       break;
4614     case T_SHORT:
4615       if (ideal_opc == Op_SaturatingAddV) {
4616         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4617       } else {
4618         assert(ideal_opc == Op_SaturatingSubV, "");
4619         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4620       }
4621       break;
4622     default:
4623       fatal("Unsupported type %s", type2name(elem_bt));
4624       break;
4625   }
4626 }
4627 
4628 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4629                                                         XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4630   switch (elem_bt) {
4631     case T_BYTE:
4632       if (ideal_opc == Op_SaturatingAddV) {
4633         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4634       } else {
4635         assert(ideal_opc == Op_SaturatingSubV, "");
4636         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4637       }
4638       break;
4639     case T_SHORT:
4640       if (ideal_opc == Op_SaturatingAddV) {
4641         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4642       } else {
4643         assert(ideal_opc == Op_SaturatingSubV, "");
4644         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4645       }
4646       break;
4647     default:
4648       fatal("Unsupported type %s", type2name(elem_bt));
4649       break;
4650   }
4651 }
4652 
4653 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4654                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4655                                     bool is_varshift) {
4656   switch (ideal_opc) {
4657     case Op_AddVB:
4658       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4659     case Op_AddVS:
4660       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4661     case Op_AddVI:
4662       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4663     case Op_AddVL:
4664       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4665     case Op_AddVF:
4666       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4667     case Op_AddVD:
4668       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4669     case Op_SubVB:
4670       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4671     case Op_SubVS:
4672       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4673     case Op_SubVI:
4674       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4675     case Op_SubVL:
4676       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4677     case Op_SubVF:
4678       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4679     case Op_SubVD:
4680       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4681     case Op_MulVS:
4682       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4683     case Op_MulVI:
4684       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4685     case Op_MulVL:
4686       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4687     case Op_MulVF:
4688       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4689     case Op_MulVD:
4690       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4691     case Op_DivVF:
4692       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4693     case Op_DivVD:
4694       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4695     case Op_SqrtVF:
4696       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4697     case Op_SqrtVD:
4698       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4699     case Op_AbsVB:
4700       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4701     case Op_AbsVS:
4702       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4703     case Op_AbsVI:
4704       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4705     case Op_AbsVL:
4706       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4707     case Op_FmaVF:
4708       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4709     case Op_FmaVD:
4710       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4711     case Op_VectorRearrange:
4712       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4713     case Op_LShiftVS:
4714       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4715     case Op_LShiftVI:
4716       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4717     case Op_LShiftVL:
4718       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4719     case Op_RShiftVS:
4720       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4721     case Op_RShiftVI:
4722       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4723     case Op_RShiftVL:
4724       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4725     case Op_URShiftVS:
4726       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4727     case Op_URShiftVI:
4728       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4729     case Op_URShiftVL:
4730       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4731     case Op_RotateLeftV:
4732       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4733     case Op_RotateRightV:
4734       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4735     case Op_MaxV:
4736       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4737     case Op_MinV:
4738       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4739     case Op_UMinV:
4740       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4741     case Op_UMaxV:
4742       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4743     case Op_XorV:
4744       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4745     case Op_OrV:
4746       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4747     case Op_AndV:
4748       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4749     default:
4750       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4751       break;
4752   }
4753 }
4754 
4755 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4756                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4757   switch (ideal_opc) {
4758     case Op_AddVB:
4759       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4760     case Op_AddVS:
4761       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4762     case Op_AddVI:
4763       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4764     case Op_AddVL:
4765       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4766     case Op_AddVF:
4767       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4768     case Op_AddVD:
4769       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4770     case Op_SubVB:
4771       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4772     case Op_SubVS:
4773       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4774     case Op_SubVI:
4775       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4776     case Op_SubVL:
4777       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4778     case Op_SubVF:
4779       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4780     case Op_SubVD:
4781       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4782     case Op_MulVS:
4783       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4784     case Op_MulVI:
4785       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4786     case Op_MulVL:
4787       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4788     case Op_MulVF:
4789       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4790     case Op_MulVD:
4791       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4792     case Op_DivVF:
4793       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4794     case Op_DivVD:
4795       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4796     case Op_FmaVF:
4797       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4798     case Op_FmaVD:
4799       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4800     case Op_MaxV:
4801       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4802     case Op_MinV:
4803       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4804     case Op_UMaxV:
4805       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4806     case Op_UMinV:
4807       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4808     case Op_XorV:
4809       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4810     case Op_OrV:
4811       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4812     case Op_AndV:
4813       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4814     default:
4815       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4816       break;
4817   }
4818 }
4819 
4820 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4821                                   KRegister src1, KRegister src2) {
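  // Map the mask length to the narrowest opmask instruction width that covers it
  // (the byte/word/dword/qword forms of kand/kor/kxor operate on 8/16/32/64 bits).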
4822   BasicType etype = T_ILLEGAL;
4823   switch(mask_len) {
4824     case 2:
4825     case 4:
4826     case 8:  etype = T_BYTE; break;
4827     case 16: etype = T_SHORT; break;
4828     case 32: etype = T_INT; break;
4829     case 64: etype = T_LONG; break;
4830     default: fatal("Unsupported type"); break;
4831   }
4832   assert(etype != T_ILLEGAL, "");
4833   switch(ideal_opc) {
4834     case Op_AndVMask:
4835       kand(etype, dst, src1, src2); break;
4836     case Op_OrVMask:
4837       kor(etype, dst, src1, src2); break;
4838     case Op_XorVMask:
4839       kxor(etype, dst, src1, src2); break;
4840     default:
4841       fatal("Unsupported masked operation"); break;
4842   }
4843 }
4844 
4845 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4847  * If src is NaN, the result is 0.
4848  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4849  * the result is equal to the value of Integer.MIN_VALUE.
4850  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4851  * the result is equal to the value of Integer.MAX_VALUE.
4852  */
4853 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4854                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4855                                                                    Register rscratch, AddressLiteral float_sign_flip,
4856                                                                    int vec_enc) {
4857   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4858   Label done;
4859   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4860   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4861   vptest(xtmp2, xtmp2, vec_enc);
4862   jccb(Assembler::equal, done);
4863 
4864   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4865   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4866 
4867   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4868   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4869   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4870 
  // Recompute the mask for the remaining special values.
4872   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4873   // Extract SRC values corresponding to TRUE mask lanes.
4874   vpand(xtmp4, xtmp2, src, vec_enc);
4875   // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
4876   // values are set.
4877   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4878 
4879   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4880   bind(done);
4881 }
4882 
4883 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4884                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4885                                                                     Register rscratch, AddressLiteral float_sign_flip,
4886                                                                     int vec_enc) {
4887   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4888   Label done;
4889   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4890   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4891   kortestwl(ktmp1, ktmp1);
4892   jccb(Assembler::equal, done);
4893 
4894   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4895   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4896   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4897 
4898   kxorwl(ktmp1, ktmp1, ktmp2);
4899   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4900   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4901   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4902   bind(done);
4903 }
4904 
4905 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4906                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4907                                                                      Register rscratch, AddressLiteral double_sign_flip,
4908                                                                      int vec_enc) {
4909   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4910 
4911   Label done;
4912   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4913   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4914   kortestwl(ktmp1, ktmp1);
4915   jccb(Assembler::equal, done);
4916 
4917   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4918   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4919   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4920 
4921   kxorwl(ktmp1, ktmp1, ktmp2);
4922   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4923   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4924   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4925   bind(done);
4926 }
4927 
4928 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4929                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4930                                                                      Register rscratch, AddressLiteral float_sign_flip,
4931                                                                      int vec_enc) {
4932   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4933   Label done;
4934   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4935   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4936   kortestwl(ktmp1, ktmp1);
4937   jccb(Assembler::equal, done);
4938 
4939   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4940   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4941   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4942 
4943   kxorwl(ktmp1, ktmp1, ktmp2);
4944   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4945   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4946   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4947   bind(done);
4948 }
4949 
4950 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4952  * If src is NaN, the result is 0.
4953  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4954  * the result is equal to the value of Long.MIN_VALUE.
4955  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4956  * the result is equal to the value of Long.MAX_VALUE.
4957  */
4958 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4959                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4960                                                                       Register rscratch, AddressLiteral double_sign_flip,
4961                                                                       int vec_enc) {
4962   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4963 
4964   Label done;
4965   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4966   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4967   kortestwl(ktmp1, ktmp1);
4968   jccb(Assembler::equal, done);
4969 
4970   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4971   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4972   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4973 
4974   kxorwl(ktmp1, ktmp1, ktmp2);
4975   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4976   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4977   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4978   bind(done);
4979 }
4980 
4981 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4982                                                              XMMRegister xtmp, int index, int vec_enc) {
  assert(vec_enc < Assembler::AVX_512bit, "");
  if (vec_enc == Assembler::AVX_256bit) {
    vextractf128_high(xtmp, src);
    vshufps(dst, src, xtmp, index, vec_enc);
  } else {
    vshufps(dst, src, zero, index, vec_enc);
  }
4990 }
4991 
4992 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4993                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
4994                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
4995   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4996 
4997   Label done;
4998   // Compare the destination lanes with float_sign_flip
4999   // value to get mask for all special values.
5000   movdqu(xtmp1, float_sign_flip, rscratch);
5001   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5002   ptest(xtmp2, xtmp2);
5003   jccb(Assembler::equal, done);
5004 
5005   // Flip float_sign_flip to get max integer value.
5006   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5007   pxor(xtmp1, xtmp4);
5008 
  // Set destination lanes corresponding to unordered (NaN) source lanes to zero.
5010   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5011   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5012 
  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5014   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5015   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5016 
  // Recompute the mask for the remaining special values.
5018   pxor(xtmp2, xtmp3);
5019   // Extract mask corresponding to non-negative source lanes.
5020   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5021 
  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5023   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5024   pand(xtmp3, xtmp2);
5025 
  // Replace destination lanes holding the special value (0x80000000) with max int
  // if the corresponding source lane holds a +ve value.
5028   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5029   bind(done);
5030 }
5031 
5032 
5033 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5034                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5035   switch(to_elem_bt) {
5036     case T_SHORT:
5037       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5038       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5039       vpackusdw(dst, dst, zero, vec_enc);
5040       if (vec_enc == Assembler::AVX_256bit) {
5041         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5042       }
5043       break;
5044     case  T_BYTE:
5045       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5046       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5047       vpackusdw(dst, dst, zero, vec_enc);
5048       if (vec_enc == Assembler::AVX_256bit) {
5049         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5050       }
5051       vpackuswb(dst, dst, zero, vec_enc);
5052       break;
5053     default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt));
5054   }
5055 }
5056 
5057 /*
 * Algorithm for vector D2L and F2I conversions (used when AVX10.2 is not supported):
 * a) Perform the vector D2L/F2I cast.
 * b) Take the fast path if none of the result vector lanes contains the 0x80000000
 *    value, whose presence signifies that the source could be one of the special
 *    floating point values (NaN, -Inf, Inf, Max, -Min).
 * c) Set the destination to zero if the source is NaN.
 * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
5065  */
5066 
5067 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5068                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5069                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5070   int to_elem_sz = type2aelembytes(to_elem_bt);
5071   assert(to_elem_sz <= 4, "");
5072   vcvttps2dq(dst, src, vec_enc);
5073   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5074   if (to_elem_sz < 4) {
5075     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5076     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5077   }
5078 }
5079 
5080 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5081                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5082                                             Register rscratch, int vec_enc) {
5083   int to_elem_sz = type2aelembytes(to_elem_bt);
5084   assert(to_elem_sz <= 4, "");
5085   vcvttps2dq(dst, src, vec_enc);
5086   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5087   switch(to_elem_bt) {
5088     case T_INT:
5089       break;
5090     case T_SHORT:
5091       evpmovdw(dst, dst, vec_enc);
5092       break;
5093     case T_BYTE:
5094       evpmovdb(dst, dst, vec_enc);
5095       break;
5096     default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt));
5097   }
5098 }
5099 
5100 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5101                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5102                                             Register rscratch, int vec_enc) {
5103   evcvttps2qq(dst, src, vec_enc);
5104   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5105 }
5106 
5107 // Handling for downcasting from double to integer or sub-word types on AVX2.
5108 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5109                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5110                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5111   int to_elem_sz = type2aelembytes(to_elem_bt);
5112   assert(to_elem_sz < 8, "");
5113   vcvttpd2dq(dst, src, vec_enc);
5114   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5115                                               float_sign_flip, vec_enc);
5116   if (to_elem_sz < 4) {
5117     // xtmp4 holds all zero lanes.
5118     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5119   }
5120 }
5121 
5122 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5123                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5124                                             KRegister ktmp2, AddressLiteral sign_flip,
5125                                             Register rscratch, int vec_enc) {
5126   if (VM_Version::supports_avx512dq()) {
5127     evcvttpd2qq(dst, src, vec_enc);
5128     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5129     switch(to_elem_bt) {
5130       case T_LONG:
5131         break;
5132       case T_INT:
5133         evpmovsqd(dst, dst, vec_enc);
5134         break;
5135       case T_SHORT:
5136         evpmovsqd(dst, dst, vec_enc);
5137         evpmovdw(dst, dst, vec_enc);
5138         break;
5139       case T_BYTE:
5140         evpmovsqd(dst, dst, vec_enc);
5141         evpmovdb(dst, dst, vec_enc);
5142         break;
5143       default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt));
5144     }
5145   } else {
5146     assert(type2aelembytes(to_elem_bt) <= 4, "");
5147     vcvttpd2dq(dst, src, vec_enc);
5148     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5149     switch(to_elem_bt) {
5150       case T_INT:
5151         break;
5152       case T_SHORT:
5153         evpmovdw(dst, dst, vec_enc);
5154         break;
5155       case T_BYTE:
5156         evpmovdb(dst, dst, vec_enc);
5157         break;
5158       default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt));
5159     }
5160   }
5161 }
5162 
5163 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5164   switch(to_elem_bt) {
5165     case T_LONG:
5166       evcvttps2qqs(dst, src, vec_enc);
5167       break;
5168     case T_INT:
5169       evcvttps2dqs(dst, src, vec_enc);
5170       break;
5171     case T_SHORT:
5172       evcvttps2dqs(dst, src, vec_enc);
5173       evpmovdw(dst, dst, vec_enc);
5174       break;
5175     case T_BYTE:
5176       evcvttps2dqs(dst, src, vec_enc);
5177       evpmovdb(dst, dst, vec_enc);
5178       break;
5179     default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt));
5180   }
5181 }
5182 
5183 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5184   switch(to_elem_bt) {
5185     case T_LONG:
5186       evcvttps2qqs(dst, src, vec_enc);
5187       break;
5188     case T_INT:
5189       evcvttps2dqs(dst, src, vec_enc);
5190       break;
5191     case T_SHORT:
5192       evcvttps2dqs(dst, src, vec_enc);
5193       evpmovdw(dst, dst, vec_enc);
5194       break;
5195     case T_BYTE:
5196       evcvttps2dqs(dst, src, vec_enc);
5197       evpmovdb(dst, dst, vec_enc);
5198       break;
5199     default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt));
5200   }
5201 }
5202 
5203 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5204   switch(to_elem_bt) {
5205     case T_LONG:
5206       evcvttpd2qqs(dst, src, vec_enc);
5207       break;
5208     case T_INT:
5209       evcvttpd2dqs(dst, src, vec_enc);
5210       break;
5211     case T_SHORT:
5212       evcvttpd2dqs(dst, src, vec_enc);
5213       evpmovdw(dst, dst, vec_enc);
5214       break;
5215     case T_BYTE:
5216       evcvttpd2dqs(dst, src, vec_enc);
5217       evpmovdb(dst, dst, vec_enc);
5218       break;
5219     default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (reg src): %s", type2name(to_elem_bt));
5220   }
5221 }
5222 
5223 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5224   switch(to_elem_bt) {
5225     case T_LONG:
5226       evcvttpd2qqs(dst, src, vec_enc);
5227       break;
5228     case T_INT:
5229       evcvttpd2dqs(dst, src, vec_enc);
5230       break;
5231     case T_SHORT:
5232       evcvttpd2dqs(dst, src, vec_enc);
5233       evpmovdw(dst, dst, vec_enc);
5234       break;
5235     case T_BYTE:
5236       evcvttpd2dqs(dst, src, vec_enc);
5237       evpmovdb(dst, dst, vec_enc);
5238       break;
5239     default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt));
5240   }
5241 }
5242 
5243 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5244                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5245                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation with MXCSR.RC set to round towards -inf,
  // and restore the original MXCSR.RC mode afterwards.
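  // Worked example (illustrative values): for val = 1.4 the sum is 1.9; with
  // RC = round towards -inf the conversion yields 1, matching Math.round(1.4),
  // whereas the default round-to-nearest mode would yield 2.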
5248   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5249 
5250   mov64(tmp, julong_cast(0.5L));
5251   evpbroadcastq(xtmp1, tmp, vec_enc);
5252   vaddpd(xtmp1, src , xtmp1, vec_enc);
5253   evcvtpd2qq(dst, xtmp1, vec_enc);
5254   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                                double_sign_flip, vec_enc);
5256 
5257   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5258 }
5259 
5260 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5261                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5262                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation with MXCSR.RC set to round towards -inf,
  // and restore the original MXCSR.RC mode afterwards.
5265   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5266 
5267   movl(tmp, jint_cast(0.5));
5268   movq(xtmp1, tmp);
5269   vbroadcastss(xtmp1, xtmp1, vec_enc);
5270   vaddps(xtmp1, src , xtmp1, vec_enc);
5271   vcvtps2dq(dst, xtmp1, vec_enc);
5272   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5273                                               float_sign_flip, vec_enc);
5274 
5275   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5276 }
5277 
5278 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5279                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5280                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform the floor(val+0.5) operation with MXCSR.RC set to round towards -inf,
  // and restore the original MXCSR.RC mode afterwards.
5283   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5284 
5285   movl(tmp, jint_cast(0.5));
5286   movq(xtmp1, tmp);
5287   vbroadcastss(xtmp1, xtmp1, vec_enc);
5288   vaddps(xtmp1, src , xtmp1, vec_enc);
5289   vcvtps2dq(dst, xtmp1, vec_enc);
5290   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5291 
5292   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5293 }
5294 
5295 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5296                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5297   switch (from_elem_bt) {
5298     case T_BYTE:
5299       switch (to_elem_bt) {
5300         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5301         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5302         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5303         default: ShouldNotReachHere();
5304       }
5305       break;
5306     case T_SHORT:
5307       switch (to_elem_bt) {
5308         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5309         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5310         default: ShouldNotReachHere();
5311       }
5312       break;
5313     case T_INT:
5314       assert(to_elem_bt == T_LONG, "");
5315       vpmovzxdq(dst, src, vlen_enc);
5316       break;
5317     default:
5318       ShouldNotReachHere();
5319   }
5320 }
5321 
5322 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5323                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5324   switch (from_elem_bt) {
5325     case T_BYTE:
5326       switch (to_elem_bt) {
5327         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5328         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5329         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5330         default: ShouldNotReachHere();
5331       }
5332       break;
5333     case T_SHORT:
5334       switch (to_elem_bt) {
5335         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5336         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5337         default: ShouldNotReachHere();
5338       }
5339       break;
5340     case T_INT:
5341       assert(to_elem_bt == T_LONG, "");
5342       vpmovsxdq(dst, src, vlen_enc);
5343       break;
5344     default:
5345       ShouldNotReachHere();
5346   }
5347 }
5348 
5349 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5350                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5351   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5352   assert(vlen_enc != AVX_512bit, "");
5353 
5354   int dst_bt_size = type2aelembytes(dst_bt);
5355   int src_bt_size = type2aelembytes(src_bt);
5356   if (dst_bt_size > src_bt_size) {
5357     switch (dst_bt_size / src_bt_size) {
5358       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5359       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5360       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5361       default: ShouldNotReachHere();
5362     }
5363   } else {
5364     assert(dst_bt_size < src_bt_size, "");
5365     switch (src_bt_size / dst_bt_size) {
5366       case 2: {
5367         if (vlen_enc == AVX_128bit) {
5368           vpacksswb(dst, src, src, vlen_enc);
5369         } else {
5370           vpacksswb(dst, src, src, vlen_enc);
5371           vpermq(dst, dst, 0x08, vlen_enc);
5372         }
5373         break;
5374       }
5375       case 4: {
5376         if (vlen_enc == AVX_128bit) {
5377           vpackssdw(dst, src, src, vlen_enc);
5378           vpacksswb(dst, dst, dst, vlen_enc);
5379         } else {
5380           vpackssdw(dst, src, src, vlen_enc);
5381           vpermq(dst, dst, 0x08, vlen_enc);
5382           vpacksswb(dst, dst, dst, AVX_128bit);
5383         }
5384         break;
5385       }
5386       case 8: {
5387         if (vlen_enc == AVX_128bit) {
5388           vpshufd(dst, src, 0x08, vlen_enc);
5389           vpackssdw(dst, dst, dst, vlen_enc);
5390           vpacksswb(dst, dst, dst, vlen_enc);
5391         } else {
5392           vpshufd(dst, src, 0x08, vlen_enc);
5393           vpermq(dst, dst, 0x08, vlen_enc);
5394           vpackssdw(dst, dst, dst, AVX_128bit);
5395           vpacksswb(dst, dst, dst, AVX_128bit);
5396         }
5397         break;
5398       }
5399       default: ShouldNotReachHere();
5400     }
5401   }
5402 }
5403 
5404 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5405                                    bool merge, BasicType bt, int vlen_enc) {
5406   if (bt == T_INT) {
5407     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5408   } else {
5409     assert(bt == T_LONG, "");
5410     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5411   }
5412 }
5413 
5414 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5415                                    bool merge, BasicType bt, int vlen_enc) {
5416   if (bt == T_INT) {
5417     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5418   } else {
5419     assert(bt == T_LONG, "");
5420     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5421   }
5422 }
5423 
5424 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5425                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5426                                                int vec_enc) {
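  // Deposit each of the low 8 mask bits into the least significant bit of a
  // byte lane: pdep with the 0x0101010101010101 selector places mask bit i
  // into bit 8*i of the result, yielding a 0/1 byte per lane, 8 lanes per step.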
5427   int index = 0;
5428   int vindex = 0;
5429   mov64(rtmp1, 0x0101010101010101L);
5430   pdepq(rtmp1, src, rtmp1);
5431   if (mask_len > 8) {
5432     movq(rtmp2, src);
5433     vpxor(xtmp, xtmp, xtmp, vec_enc);
5434     movq(xtmp, rtmp1);
5435   }
5436   movq(dst, rtmp1);
5437 
5438   mask_len -= 8;
5439   while (mask_len > 0) {
    assert((mask_len & 0x7) == 0, "mask_len must be a multiple of 8");
5441     index++;
5442     if ((index % 2) == 0) {
5443       pxor(xtmp, xtmp);
5444     }
5445     mov64(rtmp1, 0x0101010101010101L);
5446     shrq(rtmp2, 8);
5447     pdepq(rtmp1, rtmp2, rtmp1);
5448     pinsrq(xtmp, rtmp1, index % 2);
5449     vindex = index / 2;
5450     if (vindex) {
      // Write the entire 16-byte vector only when both 64-bit
      // lanes have been updated, to save redundant instructions.
5453       if (index % 2) {
5454         vinsertf128(dst, dst, xtmp, vindex);
5455       }
5456     } else {
5457       vmovdqu(dst, xtmp);
5458     }
5459     mask_len -= 8;
5460   }
5461 }
5462 
5463 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5464   switch(opc) {
5465     case Op_VectorMaskTrueCount:
5466       popcntq(dst, tmp);
5467       break;
5468     case Op_VectorMaskLastTrue:
5469       if (VM_Version::supports_lzcnt()) {
5470         lzcntq(tmp, tmp);
5471         movl(dst, 63);
5472         subl(dst, tmp);
5473       } else {
5474         movl(dst, -1);
5475         bsrq(tmp, tmp);
5476         cmov32(Assembler::notZero, dst, tmp);
5477       }
5478       break;
5479     case Op_VectorMaskFirstTrue:
5480       if (VM_Version::supports_bmi1()) {
5481         if (masklen < 32) {
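          // Set a sentinel bit at position masklen so that tzcnt returns
          // masklen (meaning "no lane set") when the mask is empty.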
5482           orl(tmp, 1 << masklen);
5483           tzcntl(dst, tmp);
5484         } else if (masklen == 32) {
5485           tzcntl(dst, tmp);
5486         } else {
5487           assert(masklen == 64, "");
5488           tzcntq(dst, tmp);
5489         }
5490       } else {
5491         if (masklen < 32) {
5492           orl(tmp, 1 << masklen);
5493           bsfl(dst, tmp);
5494         } else {
5495           assert(masklen == 32 || masklen == 64, "");
5496           movl(dst, masklen);
5497           if (masklen == 32)  {
5498             bsfl(tmp, tmp);
5499           } else {
5500             bsfq(tmp, tmp);
5501           }
5502           cmov32(Assembler::notZero, dst, tmp);
5503         }
5504       }
5505       break;
5506     case Op_VectorMaskToLong:
5507       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5508       break;
5509     default: assert(false, "Unhandled mask operation");
5510   }
5511 }
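
// Illustrative scalar sketch (not emitted code) of the FirstTrue lowering above:
// for masklen < 32 a sentinel bit is OR-ed in at position masklen so that the
// count-trailing-zeros instruction returns masklen when no mask bit is set,
// matching the Vector API firstTrue() contract.
//   int first_true(uint32_t mask_bits, int masklen) {      // masklen < 32
//     return __builtin_ctz(mask_bits | (1u << masklen));
//   }
// e.g. first_true(0, 8) == 8 and first_true(0b100, 8) == 2.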
5512 
5513 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5514                                               int masklen, int masksize, int vec_enc) {
5515   assert(VM_Version::supports_popcnt(), "");
5516 
  if (VM_Version::supports_avx512bw()) {
5518     kmovql(tmp, mask);
5519   } else {
5520     assert(masklen <= 16, "");
5521     kmovwl(tmp, mask);
5522   }
5523 
5524   // Mask generated out of partial vector comparisons/replicate/mask manipulation
5525   // operations needs to be clipped.
5526   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5527     andq(tmp, (1 << masklen) - 1);
5528   }
5529 
5530   vector_mask_operation_helper(opc, dst, tmp, masklen);
5531 }
5532 
5533 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5534                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5535   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5536          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5537   assert(VM_Version::supports_popcnt(), "");
5538 
5539   bool need_clip = false;
5540   switch(bt) {
5541     case T_BOOLEAN:
      // While masks of other types contain 0 or -1 lane values, boolean masks contain lane values of 0 or 1
5543       vpxor(xtmp, xtmp, xtmp, vec_enc);
5544       vpsubb(xtmp, xtmp, mask, vec_enc);
5545       vpmovmskb(tmp, xtmp, vec_enc);
5546       need_clip = masklen < 16;
5547       break;
5548     case T_BYTE:
5549       vpmovmskb(tmp, mask, vec_enc);
5550       need_clip = masklen < 16;
5551       break;
5552     case T_SHORT:
5553       vpacksswb(xtmp, mask, mask, vec_enc);
5554       if (masklen >= 16) {
5555         vpermpd(xtmp, xtmp, 8, vec_enc);
5556       }
5557       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5558       need_clip = masklen < 16;
5559       break;
5560     case T_INT:
5561     case T_FLOAT:
5562       vmovmskps(tmp, mask, vec_enc);
5563       need_clip = masklen < 4;
5564       break;
5565     case T_LONG:
5566     case T_DOUBLE:
5567       vmovmskpd(tmp, mask, vec_enc);
5568       need_clip = masklen < 2;
5569       break;
5570     default: assert(false, "Unhandled type, %s", type2name(bt));
5571   }
5572 
5573   // Mask generated out of partial vector comparisons/replicate/mask manipulation
5574   // operations needs to be clipped.
5575   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5576     // need_clip implies masklen < 32
5577     andq(tmp, (1 << masklen) - 1);
5578   }
5579 
5580   vector_mask_operation_helper(opc, dst, tmp, masklen);
5581 }
5582 
5583 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5584                                              Register rtmp2, int mask_len) {
5585   kmov(rtmp1, src);
5586   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5587   mov64(rtmp2, -1L);
5588   pextq(rtmp2, rtmp2, rtmp1);
5589   kmov(dst, rtmp2);
5590 }
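
// Illustrative scalar sketch (not emitted code) of the PEXT step used above:
// extracting the bits of an all-ones value through the (clipped) source mask
// packs popcount(mask) ones into the low bits, i.e. the compressed mask keeps
// the first popcount(mask) lanes active.
//   uint64_t compress_mask(uint64_t mask_bits, int mask_len) {
//     mask_bits &= ~0ULL >> (64 - mask_len);                // clip to mask_len bits
//     return _pext_u64(~0ULL, mask_bits);                   // low popcount(mask) bits set
//   }
// e.g. compress_mask(0b101001, 6) == 0b000111.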
5591 
5592 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5593                                                     XMMRegister mask, Register rtmp, Register rscratch,
5594                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5595                                                     int vec_enc) {
5596   assert(type2aelembytes(bt) >= 4, "");
5597   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5598   address compress_perm_table = nullptr;
5599   address expand_perm_table = nullptr;
5600   if (type2aelembytes(bt) == 8) {
5601     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5602     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5603     vmovmskpd(rtmp, mask, vec_enc);
5604   } else {
5605     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5606     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5607     vmovmskps(rtmp, mask, vec_enc);
5608   }
5609   shlq(rtmp, 5); // for 32 byte permute row.
5610   if (opcode == Op_CompressV) {
5611     lea(rscratch, ExternalAddress(compress_perm_table));
5612   } else {
5613     lea(rscratch, ExternalAddress(expand_perm_table));
5614   }
5615   addptr(rtmp, rscratch);
5616   vmovdqu(permv, Address(rtmp));
5617   vpermps(dst, permv, src, Assembler::AVX_256bit);
5618   vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with a zero vector using the permute mask: each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, so the same row can also serve as a blending mask after
  // compressing/expanding the source vector lanes.
5623   vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
5624 }
5625 
5626 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5627                                                bool merge, BasicType bt, int vec_enc) {
5628   if (opcode == Op_CompressV) {
5629     switch(bt) {
5630     case T_BYTE:
5631       evpcompressb(dst, mask, src, merge, vec_enc);
5632       break;
5633     case T_CHAR:
5634     case T_SHORT:
5635       evpcompressw(dst, mask, src, merge, vec_enc);
5636       break;
5637     case T_INT:
5638       evpcompressd(dst, mask, src, merge, vec_enc);
5639       break;
5640     case T_FLOAT:
5641       evcompressps(dst, mask, src, merge, vec_enc);
5642       break;
5643     case T_LONG:
5644       evpcompressq(dst, mask, src, merge, vec_enc);
5645       break;
5646     case T_DOUBLE:
5647       evcompresspd(dst, mask, src, merge, vec_enc);
5648       break;
5649     default:
5650       fatal("Unsupported type %s", type2name(bt));
5651       break;
5652     }
5653   } else {
5654     assert(opcode == Op_ExpandV, "");
5655     switch(bt) {
5656     case T_BYTE:
5657       evpexpandb(dst, mask, src, merge, vec_enc);
5658       break;
5659     case T_CHAR:
5660     case T_SHORT:
5661       evpexpandw(dst, mask, src, merge, vec_enc);
5662       break;
5663     case T_INT:
5664       evpexpandd(dst, mask, src, merge, vec_enc);
5665       break;
5666     case T_FLOAT:
5667       evexpandps(dst, mask, src, merge, vec_enc);
5668       break;
5669     case T_LONG:
5670       evpexpandq(dst, mask, src, merge, vec_enc);
5671       break;
5672     case T_DOUBLE:
5673       evexpandpd(dst, mask, src, merge, vec_enc);
5674       break;
5675     default:
5676       fatal("Unsupported type %s", type2name(bt));
5677       break;
5678     }
5679   }
5680 }
5681 
5682 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5683                                            KRegister ktmp1, int vec_enc) {
5684   if (opcode == Op_SignumVD) {
5685     vsubpd(dst, zero, one, vec_enc);
5686     // if src < 0 ? -1 : 1
5687     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5688     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5689     // if src == NaN, -0.0 or 0.0 return src.
5690     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5691     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5692   } else {
5693     assert(opcode == Op_SignumVF, "");
5694     vsubps(dst, zero, one, vec_enc);
5695     // if src < 0 ? -1 : 1
5696     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5697     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5698     // if src == NaN, -0.0 or 0.0 return src.
5699     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5700     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5701   }
5702 }
5703 
5704 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5705                                           XMMRegister xtmp1, int vec_enc) {
5706   if (opcode == Op_SignumVD) {
5707     vsubpd(dst, zero, one, vec_enc);
5708     // if src < 0 ? -1 : 1
5709     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
5710     // if src == NaN, -0.0 or 0.0 return src.
5711     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5712     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5713   } else {
5714     assert(opcode == Op_SignumVF, "");
5715     vsubps(dst, zero, one, vec_enc);
5716     // if src < 0 ? -1 : 1
5717     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
5718     // if src == NaN, -0.0 or 0.0 return src.
5719     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5720     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5721   }
5722 }
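
// Illustrative scalar sketch (not emitted code) of the Signum lowerings above;
// the single EQ_UQ (equal or unordered) compare covers both the +/-0.0 and the
// NaN case.
//   double signum(double src) {
//     double res = (src < 0.0) ? -1.0 : 1.0;   // blend between one and (zero - one)
//     if (src == 0.0 || src != src) res = src; // keep +/-0.0 and NaN unchanged
//     return res;
//   }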
5723 
5724 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5725   if (VM_Version::supports_avx512bw()) {
5726     if (mask_len > 32) {
5727       kmovql(dst, src);
5728     } else {
5729       kmovdl(dst, src);
5730       if (mask_len != 32) {
5731         kshiftrdl(dst, dst, 32 - mask_len);
5732       }
5733     }
5734   } else {
5735     assert(mask_len <= 16, "");
5736     kmovwl(dst, src);
5737     if (mask_len != 16) {
5738       kshiftrwl(dst, dst, 16 - mask_len);
5739     }
5740   }
5741 }
5742 
5743 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5744   int lane_size = type2aelembytes(bt);
5745   if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5746       (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5747     movptr(rtmp, imm32);
5748     switch(lane_size) {
5749       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5750       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5751       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5752       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size);
                break;
5755     }
5756   } else {
5757     movptr(rtmp, imm32);
5758     movq(dst, rtmp);
5759     switch(lane_size) {
5760       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5761       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5762       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5763       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size);
                break;
5766     }
5767   }
5768 }
5769 
5770 //
// Following is the lookup table based popcount computation algorithm:
5772 //       Index   Bit set count
5773 //     [ 0000 ->   0,
5774 //       0001 ->   1,
5775 //       0010 ->   1,
5776 //       0011 ->   2,
5777 //       0100 ->   1,
5778 //       0101 ->   2,
5779 //       0110 ->   2,
5780 //       0111 ->   3,
5781 //       1000 ->   1,
5782 //       1001 ->   2,
//       1010 ->   2,
//       1011 ->   3,
//       1100 ->   2,
//       1101 ->   3,
//       1110 ->   3,
//       1111 ->   4 ]
5788 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5789 //     shuffle indices for lookup table access.
5790 //  b. Right shift each byte of vector lane by 4 positions.
//  c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as
5792 //     shuffle indices for lookup table access.
5793 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5794 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5795 //     count of all the bytes of a quadword.
5796 //  f. Perform step e. for upper 128bit vector lane.
5797 //  g. Pack the bitset count of quadwords back to double word.
5798 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
5799 
5800 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5801                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5802   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5803   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5804   vpsrlw(dst, src, 4, vec_enc);
5805   vpand(dst, dst, xtmp1, vec_enc);
5806   vpand(xtmp1, src, xtmp1, vec_enc);
5807   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5808   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5809   vpshufb(dst, xtmp2, dst, vec_enc);
5810   vpaddb(dst, dst, xtmp1, vec_enc);
5811 }
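
// Illustrative scalar sketch (not emitted code) of the per-byte lookup table
// popcount implemented above (steps a-d): the vector form evaluates the table
// with VPSHUFB, using the low/high nibbles as shuffle indices, and adds the two
// partial counts with VPADDB.
//   int popcount_byte(uint8_t b) {
//     static const uint8_t lut[16] = {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4};
//     return lut[b & 0x0F] + lut[b >> 4];
//   }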
5812 
5813 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5814                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5815   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5816   // Following code is as per steps e,f,g and h of above algorithm.
5817   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5818   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5819   vpsadbw(dst, dst, xtmp2, vec_enc);
5820   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5821   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5822   vpackuswb(dst, xtmp1, dst, vec_enc);
5823 }
5824 
5825 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5826                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5827   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5828   // Add the popcount of upper and lower bytes of word.
5829   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5830   vpsrlw(dst, xtmp1, 8, vec_enc);
5831   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5832   vpaddw(dst, dst, xtmp1, vec_enc);
5833 }
5834 
5835 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5836                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5837   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5838   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5839   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5840 }
5841 
5842 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5843                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5844   switch(bt) {
5845     case T_LONG:
5846       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5847       break;
5848     case T_INT:
5849       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5850       break;
5851     case T_CHAR:
5852     case T_SHORT:
5853       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5854       break;
5855     case T_BYTE:
5856     case T_BOOLEAN:
5857       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5858       break;
5859     default:
5860       fatal("Unsupported type %s", type2name(bt));
5861       break;
5862   }
5863 }
5864 
5865 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5866                                                       KRegister mask, bool merge, int vec_enc) {
5867   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5868   switch(bt) {
5869     case T_LONG:
5870       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5871       evpopcntq(dst, mask, src, merge, vec_enc);
5872       break;
5873     case T_INT:
5874       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5875       evpopcntd(dst, mask, src, merge, vec_enc);
5876       break;
5877     case T_CHAR:
5878     case T_SHORT:
5879       assert(VM_Version::supports_avx512_bitalg(), "");
5880       evpopcntw(dst, mask, src, merge, vec_enc);
5881       break;
5882     case T_BYTE:
5883     case T_BOOLEAN:
5884       assert(VM_Version::supports_avx512_bitalg(), "");
5885       evpopcntb(dst, mask, src, merge, vec_enc);
5886       break;
5887     default:
5888       fatal("Unsupported type %s", type2name(bt));
5889       break;
5890   }
5891 }
5892 
5893 // Bit reversal algorithm first reverses the bits of each byte followed by
5894 // a byte level reversal for multi-byte primitive types (short/int/long).
5895 // Algorithm performs a lookup table access to get reverse bit sequence
5896 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte
5897 // is obtained by swapping the reverse bit sequences of upper and lower
5898 // nibble of a byte.
5899 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5900                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5901   if (VM_Version::supports_avx512vlbw()) {
5902 
5903     // Get the reverse bit sequence of lower nibble of each byte.
5904     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5905     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5906     evpandq(dst, xtmp2, src, vec_enc);
5907     vpshufb(dst, xtmp1, dst, vec_enc);
5908     vpsllq(dst, dst, 4, vec_enc);
5909 
5910     // Get the reverse bit sequence of upper nibble of each byte.
5911     vpandn(xtmp2, xtmp2, src, vec_enc);
5912     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5913     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5914 
5915     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5916     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5917     evporq(xtmp2, dst, xtmp2, vec_enc);
5918     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5919 
  } else if (vec_enc == Assembler::AVX_512bit) {
5921     // Shift based bit reversal.
5922     assert(bt == T_LONG || bt == T_INT, "");
5923 
5924     // Swap lower and upper nibble of each byte.
5925     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5926 
5927     // Swap two least and most significant bits of each nibble.
5928     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5929 
5930     // Swap adjacent pair of bits.
5931     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5932     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5933 
5934     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5935     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5936   } else {
5937     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5938     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5939 
5940     // Get the reverse bit sequence of lower nibble of each byte.
5941     vpand(dst, xtmp2, src, vec_enc);
5942     vpshufb(dst, xtmp1, dst, vec_enc);
5943     vpsllq(dst, dst, 4, vec_enc);
5944 
5945     // Get the reverse bit sequence of upper nibble of each byte.
5946     vpandn(xtmp2, xtmp2, src, vec_enc);
5947     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5948     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5949 
5950     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5951     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5952     vpor(xtmp2, dst, xtmp2, vec_enc);
5953     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5954   }
5955 }
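
// Illustrative scalar sketch (not emitted code) of the lookup table based bit
// reversal used above: reverse each nibble through a 16-entry table and swap
// the two nibbles; for multi-byte types a byte level reversal follows.
//   uint8_t reverse_byte_bits(uint8_t b) {
//     static const uint8_t rev4[16] = {0x0,0x8,0x4,0xC,0x2,0xA,0x6,0xE,
//                                      0x1,0x9,0x5,0xD,0x3,0xB,0x7,0xF};
//     return (uint8_t)((rev4[b & 0x0F] << 4) | rev4[b >> 4]);
//   }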
5956 
5957 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5958                                                 XMMRegister xtmp, Register rscratch) {
5959   assert(VM_Version::supports_gfni(), "");
5960   assert(rscratch != noreg || always_reachable(mask), "missing");
5961 
5962   // Galois field instruction based bit reversal based on following algorithm.
5963   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5964   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5965   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5966   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5967 }
5968 
5969 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5970                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
5971   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5972   evpandq(dst, xtmp1, src, vec_enc);
5973   vpsllq(dst, dst, nbits, vec_enc);
5974   vpandn(xtmp1, xtmp1, src, vec_enc);
5975   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5976   evporq(dst, dst, xtmp1, vec_enc);
5977 }
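
// Illustrative scalar sketch (not emitted code) of the field swap performed by
// vector_swap_nbits: adjacent n-bit fields selected by 'bitmask' are exchanged
// with their neighbours.
//   uint64_t swap_nbits(uint64_t x, int n, uint64_t m) {  // e.g. n = 4, m = 0x0F0F0F0F...
//     return ((x & m) << n) | ((x & ~m) >> n);
//   }
// Applying it with the (4, 0x0F..), (2, 0x33..) and (1, 0x55..) pairs, as done
// by the caller above, reverses the bits within each byte.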
5978 
5979 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5980                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5981   // Shift based bit reversal.
5982   assert(VM_Version::supports_evex(), "");
5983   switch(bt) {
5984     case T_LONG:
5985       // Swap upper and lower double word of each quad word.
5986       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5987       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5988       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5989       break;
5990     case T_INT:
5991       // Swap upper and lower word of each double word.
5992       evprord(xtmp1, k0, src, 16, true, vec_enc);
5993       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5994       break;
5995     case T_CHAR:
5996     case T_SHORT:
5997       // Swap upper and lower byte of each word.
5998       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5999       break;
6000     case T_BYTE:
6001       evmovdquq(dst, k0, src, true, vec_enc);
6002       break;
6003     default:
6004       fatal("Unsupported type %s", type2name(bt));
6005       break;
6006   }
6007 }
6008 
6009 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
6010   if (bt == T_BYTE) {
6011     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
6012       evmovdquq(dst, k0, src, true, vec_enc);
6013     } else {
6014       vmovdqu(dst, src);
6015     }
6016     return;
6017   }
6018   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
6019   // pre-computed shuffle indices.
6020   switch(bt) {
6021     case T_LONG:
6022       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
6023       break;
6024     case T_INT:
6025       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
6026       break;
6027     case T_CHAR:
6028     case T_SHORT:
6029       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6030       break;
6031     default:
6032       fatal("Unsupported type %s", type2name(bt));
6033       break;
6034   }
6035   vpshufb(dst, src, dst, vec_enc);
6036 }
6037 
6038 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6039                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6040                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6041   assert(is_integral_type(bt), "");
6042   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6043   assert(VM_Version::supports_avx512cd(), "");
6044   switch(bt) {
6045     case T_LONG:
6046       evplzcntq(dst, ktmp, src, merge, vec_enc);
6047       break;
6048     case T_INT:
6049       evplzcntd(dst, ktmp, src, merge, vec_enc);
6050       break;
6051     case T_SHORT:
6052       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6053       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6054       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6055       vpunpckhwd(dst, xtmp1, src, vec_enc);
6056       evplzcntd(dst, ktmp, dst, merge, vec_enc);
6057       vpackusdw(dst, xtmp2, dst, vec_enc);
6058       break;
6059     case T_BYTE:
6060       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6061       // accessing the lookup table.
6062       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6063       // accessing the lookup table.
6064       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6065       assert(VM_Version::supports_avx512bw(), "");
6066       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6067       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6068       vpand(xtmp2, dst, src, vec_enc);
6069       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6070       vpsrlw(xtmp3, src, 4, vec_enc);
6071       vpand(xtmp3, dst, xtmp3, vec_enc);
6072       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6073       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6074       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6075       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6076       break;
6077     default:
6078       fatal("Unsupported type %s", type2name(bt));
6079       break;
6080   }
6081 }
6082 
6083 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6084                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6085   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6086   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6087   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6088   // accessing the lookup table.
6089   vpand(dst, xtmp2, src, vec_enc);
6090   vpshufb(dst, xtmp1, dst, vec_enc);
6091   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6092   // accessing the lookup table.
6093   vpsrlw(xtmp3, src, 4, vec_enc);
6094   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6095   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6096   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6097   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6098   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6099   vpaddb(dst, dst, xtmp2, vec_enc);
6100   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6101 }
6102 
6103 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6104                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6105   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6106   // Add zero counts of lower byte and upper byte of a word if
6107   // upper byte holds a zero value.
6108   vpsrlw(xtmp3, src, 8, vec_enc);
6109   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6110   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6111   vpsllw(xtmp2, dst, 8, vec_enc);
6112   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6113   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6114   vpsrlw(dst, dst, 8, vec_enc);
6115 }
6116 
6117 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6118                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
6119   // By converting the integer to a float, we can obtain the number of leading zeros based on the exponent of the float.
6120   // As the float exponent contains a bias of 127 for nonzero values, the bias must be removed before interpreting the
6121   // exponent as the leading zero count.
6122 
6123   // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
6124   // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
6125   // contributes to the leading number of zeros.
6126   vpsrld(dst, src, 1, vec_enc);
6127   vpandn(dst, dst, src, vec_enc);
6128 
6129   vcvtdq2ps(dst, dst, vec_enc);
6130 
6131   // By comparing the register to itself, all the bits in the destination are set.
6132   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6133 
6134   // Move the biased exponent to the low end of the lane and mask with 0xFF to discard the sign bit.
6135   vpsrld(xtmp2, xtmp1, 24, vec_enc);
6136   vpsrld(dst, dst, 23, vec_enc);
6137   vpand(dst, xtmp2, dst, vec_enc);
6138 
6139   // Subtract 127 from the exponent, which removes the bias from the exponent.
6140   vpsrld(xtmp2, xtmp1, 25, vec_enc);
6141   vpsubd(dst, dst, xtmp2, vec_enc);
6142 
6143   vpsrld(xtmp2, xtmp1, 27, vec_enc);
6144 
6145   // If the original value is 0 the exponent would not have bias, so the subtraction creates a negative number. If this
6146   // is found in any of the lanes, replace the lane with -1 from xtmp1.
6147   vblendvps(dst, dst, xtmp1, dst, vec_enc, true, xtmp3);
6148 
6149   // If the original value is negative, replace the lane with 31.
6150   vblendvps(dst, dst, xtmp2, src, vec_enc, true, xtmp3);
6151 
6152   // Subtract the exponent from 31, giving the final result. For 0, the result is 32 as the exponent was replaced with -1,
6153   // and for negative numbers the result is 0 as the exponent was replaced with 31.
6154   vpsubd(dst, xtmp2, dst, vec_enc);
6155 }
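
// Illustrative scalar sketch (not emitted code) of the float based leading zero
// count used above, restricted to positive inputs: after dropping the bit below
// the highest set bit, the biased exponent of (float)x equals 127 + (31 - clz(x)).
//   int clz_via_float(uint32_t x) {            // 0 < x < 0x80000000
//     x &= ~(x >> 1);                          // conversion can no longer round up
//     float f = (float)x;
//     uint32_t bits;
//     memcpy(&bits, &f, sizeof(bits));
//     return 31 - (int)(((bits >> 23) & 0xFF) - 127);
//   }
// The vector code additionally blends in 32 for zero lanes and 0 for negative lanes.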
6156 
6157 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6158                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6159   // Find the leading zeros of the top and bottom halves of the long individually.
6160   vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6161 
6162   // Move the top half result to the bottom half of xtmp1, setting the top half to 0.
6163   vpsrlq(xtmp1, dst, 32, vec_enc);
6164   // By moving the top half result to the right by 6 bits, if the top half was empty (i.e. 32 is returned) the result bit will
6165   // be in the most significant position of the bottom half.
6166   vpsrlq(xtmp2, dst, 6, vec_enc);
6167 
6168   // In the bottom half, add the top half and bottom half results.
6169   vpaddq(dst, xtmp1, dst, vec_enc);
6170 
6171   // For the bottom half, choose between the values using the most significant bit of xtmp2.
  // If the MSB is set, then bottom+top in dst is the resulting value. If the top half result is less than 32, xtmp1 is chosen,
  // which contains only the top half result.
6174   // In the top half the MSB is always zero, so the value in xtmp1 is always chosen. This value is always 0, which clears
6175   // the lane as required.
6176   vblendvps(dst, xtmp1, dst, xtmp2, vec_enc, true, xtmp3);
6177 }
6178 
6179 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6180                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6181                                                        Register rtmp, int vec_enc) {
6182   assert(is_integral_type(bt), "unexpected type");
6183   assert(vec_enc < Assembler::AVX_512bit, "");
6184   switch(bt) {
6185     case T_LONG:
6186       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6187       break;
6188     case T_INT:
6189       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6190       break;
6191     case T_SHORT:
6192       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6193       break;
6194     case T_BYTE:
6195       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6196       break;
6197     default:
6198       fatal("Unsupported type %s", type2name(bt));
6199       break;
6200   }
6201 }
6202 
6203 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6204   switch(bt) {
6205     case T_BYTE:
6206       vpsubb(dst, src1, src2, vec_enc);
6207       break;
6208     case T_SHORT:
6209       vpsubw(dst, src1, src2, vec_enc);
6210       break;
6211     case T_INT:
6212       vpsubd(dst, src1, src2, vec_enc);
6213       break;
6214     case T_LONG:
6215       vpsubq(dst, src1, src2, vec_enc);
6216       break;
6217     default:
6218       fatal("Unsupported type %s", type2name(bt));
6219       break;
6220   }
6221 }
6222 
// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
6227 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6228                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6229                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6230   assert(is_integral_type(bt), "");
6231   // xtmp = -1
6232   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6233   // xtmp = xtmp + src
6234   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6235   // xtmp = xtmp & ~src
6236   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6237   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6238   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6239   vpsub(bt, dst, xtmp4, dst, vec_enc);
6240 }
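
// Illustrative scalar sketch (not emitted code) of the identity used above:
// (x - 1) & ~x sets exactly the CTZ(x) trailing bits and nothing else, so
// subtracting its leading zero count from the type width yields CTZ(x). Here
// clz32 is an assumed helper behaving like VPLZCNTD, i.e. clz32(0) == 32.
//   int ctz32(uint32_t x) {
//     uint32_t t = (x - 1) & ~x;               // 0b0..01..1 with CTZ(x) ones
//     return 32 - clz32(t);                    // x == 0 gives t == 0xFFFFFFFF, result 32
//   }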
6241 
// Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
6244 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6245                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6246   assert(is_integral_type(bt), "");
6247   // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6249   // xtmp = 0 - src
6250   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6251   // xtmp = xtmp | src
6252   vpor(xtmp3, xtmp3, src, vec_enc);
6253   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6254   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6255   vpsub(bt, dst, xtmp1, dst, vec_enc);
6256 }
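
// Illustrative scalar sketch (not emitted code) of the popcount based identity
// used above: x | -x keeps the lowest set bit and every bit above it, so its
// population count equals 32 - CTZ(x), with popcount(0) == 0 giving CTZ(0) == 32.
// popcount32 is an assumed helper.
//   int ctz32(uint32_t x) {
//     return 32 - popcount32(x | (uint32_t)(0 - x));
//   }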
6257 
6258 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6259   Label done;
6260   Label neg_divisor_fastpath;
6261   cmpl(divisor, 0);
6262   jccb(Assembler::less, neg_divisor_fastpath);
6263   xorl(rdx, rdx);
6264   divl(divisor);
6265   jmpb(done);
6266   bind(neg_divisor_fastpath);
6267   // Fastpath for divisor < 0:
6268   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Integer.divideUnsigned()
6270   movl(rdx, rax);
6271   subl(rdx, divisor);
6272   if (VM_Version::supports_bmi1()) {
6273     andnl(rax, rdx, rax);
6274   } else {
6275     notl(rdx);
6276     andl(rax, rdx);
6277   }
6278   shrl(rax, 31);
6279   bind(done);
6280 }
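
// Illustrative scalar sketch (not emitted code) of the negative divisor fastpath
// above: when the divisor has its sign bit set (i.e. is >= 2^31 unsigned), the
// unsigned quotient can only be 0 or 1, and it is 1 exactly when the dividend is
// unsigned-greater-or-equal to the divisor.
//   uint32_t udiv_neg_divisor(uint32_t dividend, uint32_t divisor) {  // divisor >= 0x80000000
//     return (dividend & ~(dividend - divisor)) >> 31;
//   }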
6281 
6282 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6283   Label done;
6284   Label neg_divisor_fastpath;
6285   cmpl(divisor, 0);
6286   jccb(Assembler::less, neg_divisor_fastpath);
6287   xorl(rdx, rdx);
6288   divl(divisor);
6289   jmpb(done);
6290   bind(neg_divisor_fastpath);
6291   // Fastpath when divisor < 0:
6292   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Integer.remainderUnsigned()
6294   movl(rdx, rax);
6295   subl(rax, divisor);
6296   if (VM_Version::supports_bmi1()) {
6297     andnl(rax, rax, rdx);
6298   } else {
6299     notl(rax);
6300     andl(rax, rdx);
6301   }
6302   sarl(rax, 31);
6303   andl(rax, divisor);
6304   subl(rdx, rax);
6305   bind(done);
6306 }
6307 
6308 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6309   Label done;
6310   Label neg_divisor_fastpath;
6311 
6312   cmpl(divisor, 0);
6313   jccb(Assembler::less, neg_divisor_fastpath);
6314   xorl(rdx, rdx);
6315   divl(divisor);
6316   jmpb(done);
6317   bind(neg_divisor_fastpath);
6318   // Fastpath for divisor < 0:
6319   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6320   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6321   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Integer.divideUnsigned() and java.lang.Integer.remainderUnsigned()
6323   movl(rdx, rax);
6324   subl(rax, divisor);
6325   if (VM_Version::supports_bmi1()) {
6326     andnl(rax, rax, rdx);
6327   } else {
6328     notl(rax);
6329     andl(rax, rdx);
6330   }
6331   movl(tmp, rax);
6332   shrl(rax, 31); // quotient
6333   sarl(tmp, 31);
6334   andl(tmp, divisor);
6335   subl(rdx, tmp); // remainder
6336   bind(done);
6337 }
6338 
6339 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6340                                  XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
6342     // Galois field instruction based bit reversal based on following algorithm.
6343     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6344     mov64(rtmp, 0x8040201008040201L);
6345     movq(xtmp1, src);
6346     movq(xtmp2, rtmp);
6347     gf2p8affineqb(xtmp1, xtmp2, 0);
6348     movq(dst, xtmp1);
6349   } else {
6350     // Swap even and odd numbered bits.
6351     movl(rtmp, src);
6352     andl(rtmp, 0x55555555);
6353     shll(rtmp, 1);
6354     movl(dst, src);
6355     andl(dst, 0xAAAAAAAA);
6356     shrl(dst, 1);
6357     orl(dst, rtmp);
6358 
6359     // Swap LSB and MSB 2 bits of each nibble.
6360     movl(rtmp, dst);
6361     andl(rtmp, 0x33333333);
6362     shll(rtmp, 2);
6363     andl(dst, 0xCCCCCCCC);
6364     shrl(dst, 2);
6365     orl(dst, rtmp);
6366 
6367     // Swap LSB and MSB 4 bits of each byte.
6368     movl(rtmp, dst);
6369     andl(rtmp, 0x0F0F0F0F);
6370     shll(rtmp, 4);
6371     andl(dst, 0xF0F0F0F0);
6372     shrl(dst, 4);
6373     orl(dst, rtmp);
6374   }
6375   bswapl(dst);
6376 }
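
// Illustrative scalar sketch (not emitted code) of the non-GFNI path above:
// three mask-and-shift swaps reverse the bits within every byte, then BSWAP
// reverses the byte order.
//   uint32_t reverse_bits(uint32_t x) {
//     x = ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1);  // swap adjacent bits
//     x = ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2);  // swap 2-bit pairs
//     x = ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4);  // swap nibbles
//     return __builtin_bswap32(x);                            // swap bytes
//   }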
6377 
6378 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6379                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
6381     // Galois field instruction based bit reversal based on following algorithm.
6382     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6383     mov64(rtmp1, 0x8040201008040201L);
6384     movq(xtmp1, src);
6385     movq(xtmp2, rtmp1);
6386     gf2p8affineqb(xtmp1, xtmp2, 0);
6387     movq(dst, xtmp1);
6388   } else {
6389     // Swap even and odd numbered bits.
6390     movq(rtmp1, src);
6391     mov64(rtmp2, 0x5555555555555555L);
6392     andq(rtmp1, rtmp2);
6393     shlq(rtmp1, 1);
6394     movq(dst, src);
6395     notq(rtmp2);
6396     andq(dst, rtmp2);
6397     shrq(dst, 1);
6398     orq(dst, rtmp1);
6399 
6400     // Swap LSB and MSB 2 bits of each nibble.
6401     movq(rtmp1, dst);
6402     mov64(rtmp2, 0x3333333333333333L);
6403     andq(rtmp1, rtmp2);
6404     shlq(rtmp1, 2);
6405     notq(rtmp2);
6406     andq(dst, rtmp2);
6407     shrq(dst, 2);
6408     orq(dst, rtmp1);
6409 
6410     // Swap LSB and MSB 4 bits of each byte.
6411     movq(rtmp1, dst);
6412     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6413     andq(rtmp1, rtmp2);
6414     shlq(rtmp1, 4);
6415     notq(rtmp2);
6416     andq(dst, rtmp2);
6417     shrq(dst, 4);
6418     orq(dst, rtmp1);
6419   }
6420   bswapq(dst);
6421 }
6422 
6423 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6424   Label done;
6425   Label neg_divisor_fastpath;
6426   cmpq(divisor, 0);
6427   jccb(Assembler::less, neg_divisor_fastpath);
6428   xorl(rdx, rdx);
6429   divq(divisor);
6430   jmpb(done);
6431   bind(neg_divisor_fastpath);
6432   // Fastpath for divisor < 0:
6433   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6434   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6435   movq(rdx, rax);
6436   subq(rdx, divisor);
6437   if (VM_Version::supports_bmi1()) {
6438     andnq(rax, rdx, rax);
6439   } else {
6440     notq(rdx);
6441     andq(rax, rdx);
6442   }
6443   shrq(rax, 63);
6444   bind(done);
6445 }
6446 
6447 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6448   Label done;
6449   Label neg_divisor_fastpath;
6450   cmpq(divisor, 0);
6451   jccb(Assembler::less, neg_divisor_fastpath);
6452   xorq(rdx, rdx);
6453   divq(divisor);
6454   jmp(done);
6455   bind(neg_divisor_fastpath);
6456   // Fastpath when divisor < 0:
6457   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6458   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6459   movq(rdx, rax);
6460   subq(rax, divisor);
6461   if (VM_Version::supports_bmi1()) {
6462     andnq(rax, rax, rdx);
6463   } else {
6464     notq(rax);
6465     andq(rax, rdx);
6466   }
6467   sarq(rax, 63);
6468   andq(rax, divisor);
6469   subq(rdx, rax);
6470   bind(done);
6471 }
6472 
6473 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6474   Label done;
6475   Label neg_divisor_fastpath;
6476   cmpq(divisor, 0);
6477   jccb(Assembler::less, neg_divisor_fastpath);
6478   xorq(rdx, rdx);
6479   divq(divisor);
6480   jmp(done);
6481   bind(neg_divisor_fastpath);
6482   // Fastpath for divisor < 0:
6483   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6484   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6485   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6486   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6487   movq(rdx, rax);
6488   subq(rax, divisor);
6489   if (VM_Version::supports_bmi1()) {
6490     andnq(rax, rax, rdx);
6491   } else {
6492     notq(rax);
6493     andq(rax, rdx);
6494   }
6495   movq(tmp, rax);
6496   shrq(rax, 63); // quotient
6497   sarq(tmp, 63);
6498   andq(tmp, divisor);
6499   subq(rdx, tmp); // remainder
6500   bind(done);
6501 }
6502 
6503 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6504                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6505                                         int vlen_enc) {
6506   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and the indices are determined using
  // the lower 4 bits of each shuffle lane, thus all shuffle indices are
  // effectively normalized to the index range 0-15. This makes sure that indices
  // which differ by a multiple of 16 select the same relative position within a
  // 128 bit lane, i.e. shuffle indices 16, 32 and 48 all select the first element
  // of their respective 128 bit lanes.
6513   movl(rtmp, 16);
6514   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6515 
  // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16.
  // Broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using
  // the original shuffle indices and move the shuffled lanes corresponding to a true
  // mask to the destination vector.
6520   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6521   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6522   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6523 
6524   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6525   // and broadcasting second 128 bit lane.
6526   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6527   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6528   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6529   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6530   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6531 
6532   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6533   // and broadcasting third 128 bit lane.
6534   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6535   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6536   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6537   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6538   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6539 
6540   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
6542   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6543   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6544   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6545   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6546   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6547 }
6548 
6549 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6550                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6551   if (vlen_enc == AVX_128bit) {
6552     vpermilps(dst, src, shuffle, vlen_enc);
6553   } else if (bt == T_INT) {
6554     vpermd(dst, shuffle, src, vlen_enc);
6555   } else {
6556     assert(bt == T_FLOAT, "");
6557     vpermps(dst, shuffle, src, vlen_enc);
6558   }
6559 }
6560 
6561 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6562   switch(opcode) {
6563     case Op_AddHF: vaddsh(dst, src1, src2); break;
6564     case Op_SubHF: vsubsh(dst, src1, src2); break;
6565     case Op_MulHF: vmulsh(dst, src1, src2); break;
6566     case Op_DivHF: vdivsh(dst, src1, src2); break;
6567     default: assert(false, "%s", NodeClassNames[opcode]); break;
6568   }
6569 }
6570 
6571 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6572   switch(elem_bt) {
6573     case T_BYTE:
6574       if (ideal_opc == Op_SaturatingAddV) {
6575         vpaddsb(dst, src1, src2, vlen_enc);
6576       } else {
6577         assert(ideal_opc == Op_SaturatingSubV, "");
6578         vpsubsb(dst, src1, src2, vlen_enc);
6579       }
6580       break;
6581     case T_SHORT:
6582       if (ideal_opc == Op_SaturatingAddV) {
6583         vpaddsw(dst, src1, src2, vlen_enc);
6584       } else {
6585         assert(ideal_opc == Op_SaturatingSubV, "");
6586         vpsubsw(dst, src1, src2, vlen_enc);
6587       }
6588       break;
6589     default:
6590       fatal("Unsupported type %s", type2name(elem_bt));
6591       break;
6592   }
6593 }
6594 
6595 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6596   switch(elem_bt) {
6597     case T_BYTE:
6598       if (ideal_opc == Op_SaturatingAddV) {
6599         vpaddusb(dst, src1, src2, vlen_enc);
6600       } else {
6601         assert(ideal_opc == Op_SaturatingSubV, "");
6602         vpsubusb(dst, src1, src2, vlen_enc);
6603       }
6604       break;
6605     case T_SHORT:
6606       if (ideal_opc == Op_SaturatingAddV) {
6607         vpaddusw(dst, src1, src2, vlen_enc);
6608       } else {
6609         assert(ideal_opc == Op_SaturatingSubV, "");
6610         vpsubusw(dst, src1, src2, vlen_enc);
6611       }
6612       break;
6613     default:
6614       fatal("Unsupported type %s", type2name(elem_bt));
6615       break;
6616   }
6617 }
6618 
6619 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6620                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
6621   // For unsigned subtraction, overflow happens when magnitude of second input is greater than first input.
6622   // overflow_mask = Inp1 <u Inp2
6623   evpcmpu(elem_bt, ktmp,  src2, src1, Assembler::lt, vlen_enc);
6624   // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6625   evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6626 }
6627 
6628 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6629                                                               XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6630   // Emulate unsigned comparison using signed comparison
6631   // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
6632   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6633   vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6634   vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6635 
6636   vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6637 
6638   // Res = INP1 - INP2 (non-commutative and non-associative)
6639   vpsub(elem_bt, dst, src1, src2, vlen_enc);
6640   // Res = Mask ? Zero : Res
6641   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6642   vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6643 }
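
// Illustrative scalar sketch (not emitted code) of the biased comparison used
// above: adding MIN_VALUE to both operands (equivalently, flipping the sign bit)
// turns an unsigned comparison into a signed one, which AVX2 provides directly
// via VPCMPGT.
//   bool ult(uint32_t a, uint32_t b) {         // a <u b
//     return (int32_t)(a ^ 0x80000000u) < (int32_t)(b ^ 0x80000000u);
//   }
// The saturating unsigned subtraction then zeroes exactly the lanes where
// src1 <u src2.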
6644 
6645 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6646                                                                XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned value ranges comprise only non-negative numbers, thus only an upper bound saturation exists.
6648   // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6649   // Res = Signed Add INP1, INP2
6650   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6651   // T1 = SRC1 | SRC2
6652   vpor(xtmp1, src1, src2, vlen_enc);
6653   // Max_Unsigned = -1
6654   vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6655   // Unsigned compare:  Mask = Res <u T1
6656   evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6657   // res  = Mask ? Max_Unsigned : Res
6658   evpblend(elem_bt, dst, ktmp,  dst, xtmp2, true, vlen_enc);
6659 }
6660 
6661 //
// Section 2-13 of Hacker's Delight lists the following overflow detection check for
// saturating unsigned addition:
//    overflow_mask = ((a & b) | ((a | b) & ~(a + b))) >>> 31 == 1
//
// We empirically determined its semantic equivalence to the following reduced expression
//    overflow_mask = (a + b) <u (a | b)
//
// and also verified it through the Alive2 solver.
6670 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6671 //
6672 
6673 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6674                                                               XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6675   // Res = Signed Add INP1, INP2
6676   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6677   // Compute T1 = INP1 | INP2
6678   vpor(xtmp3, src1, src2, vlen_enc);
6679   // T1 = Minimum signed value.
6680   vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6681   // Convert T1 to signed value, T1 = T1 + MIN_VALUE
6682   vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
6683   // Convert Res to signed value, Res<s> = Res + MIN_VALUE
6684   vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
6685   // Compute overflow detection mask = Res<1> <s T1
6686   if (elem_bt == T_INT) {
6687     vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
6688   } else {
6689     assert(elem_bt == T_LONG, "");
6690     vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
6691   }
6692   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6693 }
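
// Illustrative scalar sketch (not emitted code) of the reduced overflow check
// above: since a + b == (a | b) + (a & b), the wrapped sum is unsigned-less-than
// (a | b) exactly when the addition carried out of the top bit.
//   uint32_t sat_uadd(uint32_t a, uint32_t b) {
//     uint32_t sum = a + b;                             // wrapping add
//     return (sum < (a | b)) ? 0xFFFFFFFFu : sum;       // overflow => saturate to max
//   }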
6694 
6695 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6696                                       int vlen_enc, bool xtmp2_hold_M1) {
6697   if (VM_Version::supports_avx512dq()) {
6698     evpmovq2m(ktmp, src, vlen_enc);
6699   } else {
6700     assert(VM_Version::supports_evex(), "");
6701     if (!xtmp2_hold_M1) {
6702       vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6703     }
6704     evpsraq(xtmp1, src, 63, vlen_enc);
6705     evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6706   }
6707 }
6708 
6709 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6710                                       int vlen_enc, bool xtmp2_hold_M1) {
6711   if (VM_Version::supports_avx512dq()) {
6712     evpmovd2m(ktmp, src, vlen_enc);
6713   } else {
6714     assert(VM_Version::supports_evex(), "");
6715     if (!xtmp2_hold_M1) {
6716       vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6717     }
6718     vpsrad(xtmp1, src, 31, vlen_enc);
6719     Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6720   }
6721 }
6722 
6723 
6724 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6725   if (elem_bt == T_LONG) {
6726     if (VM_Version::supports_evex()) {
6727       evpsraq(dst, src, 63, vlen_enc);
6728     } else {
6729       vpsrad(dst, src, 31, vlen_enc);
6730       vpshufd(dst, dst, 0xF5, vlen_enc);
6731     }
6732   } else {
6733     assert(elem_bt == T_INT, "");
6734     vpsrad(dst, src, 31, vlen_enc);
6735   }
6736 }
6737 
6738 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6739   if (compute_allones) {
6740     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6741       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6742     } else {
6743       vpcmpeqq(allones, allones, allones, vlen_enc);
6744     }
6745   }
6746   if (elem_bt == T_LONG) {
6747     vpsrlq(dst, allones, 1, vlen_enc);
6748   } else {
6749     assert(elem_bt == T_INT, "");
6750     vpsrld(dst, allones, 1, vlen_enc);
6751   }
6752 }
6753 
6754 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6755   if (compute_allones) {
6756     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6757       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6758     } else {
6759       vpcmpeqq(allones, allones, allones, vlen_enc);
6760     }
6761   }
6762   if (elem_bt == T_LONG) {
6763     vpsllq(dst, allones, 63, vlen_enc);
6764   } else {
6765     assert(elem_bt == T_INT, "");
6766     vpslld(dst, allones, 31, vlen_enc);
6767   }
6768 }
6769 
6770 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
6771                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6772   switch(elem_bt) {
6773     case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6774     case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6775     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6776     case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6777     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6778   }
6779 }
6780 
6781 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6782   switch(elem_bt) {
6783     case  T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
6784     case  T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
6785     case  T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6786     case  T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
6787     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6788   }
6789 }
6790 
6791 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6792                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6793   if (elem_bt == T_LONG) {
6794     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6795   } else {
6796     assert(elem_bt == T_INT, "");
6797     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6798   }
6799 }
6800 
6801 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6802                                                          XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6803                                                          KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6804   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6805   // Addition/subtraction is performed on the two's complement representation of the values and is agnostic to signedness.
6806   // Overflow detection is based on Hacker's Delight, section 2-13.
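       // Illustrative scalar sketch of the check and the saturation (int case, not emitted code;
       // the vector code below keeps the same shape using the xtmp/ktmp registers):
       //   int  res      = src1 + src2;                                   // or src1 - src2
       //   bool overflow = (((res ^ src1) & (res ^ src2)) >>> 31) != 0;   // add flavour
       //   //   overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31) != 0;  // sub flavour
       //   if (overflow) res = (src1 < 0) ? INT_MIN : INT_MAX;            // saturate toward src1's sign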
6807   if (ideal_opc == Op_SaturatingAddV) {
6808     // res = src1 + src2
6809     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6810     // Overflow occurs if both inputs have the same sign and the sign of the result differs from it.
6811     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6812     vpxor(xtmp1, dst, src1, vlen_enc);
6813     vpxor(xtmp2, dst, src2, vlen_enc);
6814     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6815   } else {
6816     assert(ideal_opc == Op_SaturatingSubV, "");
6817     // res = src1 - src2
6818     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6819     // Overflow occurs when the inputs have opposite signs and the
6820     // sign of the result differs from the sign of the first input.
6821     // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
6822     vpxor(xtmp1, src1, src2, vlen_enc);
6823     vpxor(xtmp2, dst, src1, vlen_enc);
6824     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6825   }
6826 
6827   // Compute overflow detection mask.
6828   evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
6829   // Note: xtmp1 holds -1 in all its lanes after the above call.
6830 
6831   // Compute mask based on first input polarity.
6832   evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
6833 
6834   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6835   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6836 
6837   // Compose a vector of saturating (MAX/MIN) values: lanes corresponding to
6838   // set bits in the first input polarity mask hold the MIN value.
6839   evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6840   // Blend destination lanes with saturated values using overflow detection mask.
6841   evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6842 }
6843 
6844 
6845 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6846                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6847                                                         XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
6848   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6849   // Addition/subtraction is performed on the two's complement representation of the values and is agnostic to signedness.
6850   // Overflow detection is based on Hacker's Delight, section 2-13.
6851   if (ideal_opc == Op_SaturatingAddV) {
6852     // res = src1 + src2
6853     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6854     // Overflow occurs if both inputs have the same sign and the sign of the result differs from it.
6855     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6856     vpxor(xtmp1, dst, src1, vlen_enc);
6857     vpxor(xtmp2, dst, src2, vlen_enc);
6858     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6859   } else {
6860     assert(ideal_opc == Op_SaturatingSubV, "");
6861     // res = src1 - src2
6862     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6863     // Overflow occurs when the inputs have opposite signs and the
6864     // sign of the result differs from the sign of the first input.
6865     // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
6866     vpxor(xtmp1, src1, src2, vlen_enc);
6867     vpxor(xtmp2, dst, src1, vlen_enc);
6868     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6869   }
6870 
6871   // Sign-extend to compute overflow detection mask.
6872   vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
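       // Note: vpblendvb selects on the most significant bit of each byte of its mask operand,
       // so broadcasting a lane's sign bit across the whole lane (here and for src1 below)
       // makes every byte of that lane carry the lane's decision.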
6873 
6874   vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
6875   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
6876   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6877 
6878   // Compose saturating min/max vector using first input polarity mask.
6879   vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
6880   vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
6881 
6882   // Blend result with saturating vector using overflow detection mask.
6883   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6884 }
6885 
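     // Byte and short lanes have direct ISA support for saturating arithmetic
     // (vpadds[b/w]/vpsubs[b/w] and the unsigned vpaddus/vpsubus forms), so unlike the
     // int/long paths above no overflow emulation is required; these helpers only dispatch.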
6886 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6887   switch(elem_bt) {
6888     case T_BYTE:
6889       if (ideal_opc == Op_SaturatingAddV) {
6890         vpaddsb(dst, src1, src2, vlen_enc);
6891       } else {
6892         assert(ideal_opc == Op_SaturatingSubV, "");
6893         vpsubsb(dst, src1, src2, vlen_enc);
6894       }
6895       break;
6896     case T_SHORT:
6897       if (ideal_opc == Op_SaturatingAddV) {
6898         vpaddsw(dst, src1, src2, vlen_enc);
6899       } else {
6900         assert(ideal_opc == Op_SaturatingSubV, "");
6901         vpsubsw(dst, src1, src2, vlen_enc);
6902       }
6903       break;
6904     default:
6905       fatal("Unsupported type %s", type2name(elem_bt));
6906       break;
6907   }
6908 }
6909 
6910 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6911   switch(elem_bt) {
6912     case T_BYTE:
6913       if (ideal_opc == Op_SaturatingAddV) {
6914         vpaddusb(dst, src1, src2, vlen_enc);
6915       } else {
6916         assert(ideal_opc == Op_SaturatingSubV, "");
6917         vpsubusb(dst, src1, src2, vlen_enc);
6918       }
6919       break;
6920     case T_SHORT:
6921       if (ideal_opc == Op_SaturatingAddV) {
6922         vpaddusw(dst, src1, src2, vlen_enc);
6923       } else {
6924         assert(ideal_opc == Op_SaturatingSubV, "");
6925         vpsubusw(dst, src1, src2, vlen_enc);
6926       }
6927       break;
6928     default:
6929       fatal("Unsupported type %s", type2name(elem_bt));
6930       break;
6931   }
6932 }
6933 
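     // Two-table permute: dst is expected to hold the per-lane selection indices on entry,
     // and vpermi2[b/w/d/q/ps/pd] overwrites it with elements picked from the concatenation
     // of src1 and src2.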
6934 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6935                                                      XMMRegister src2, int vlen_enc) {
6936   switch(elem_bt) {
6937     case T_BYTE:
6938       evpermi2b(dst, src1, src2, vlen_enc);
6939       break;
6940     case T_SHORT:
6941       evpermi2w(dst, src1, src2, vlen_enc);
6942       break;
6943     case T_INT:
6944       evpermi2d(dst, src1, src2, vlen_enc);
6945       break;
6946     case T_LONG:
6947       evpermi2q(dst, src1, src2, vlen_enc);
6948       break;
6949     case T_FLOAT:
6950       evpermi2ps(dst, src1, src2, vlen_enc);
6951       break;
6952     case T_DOUBLE:
6953       evpermi2pd(dst, src1, src2, vlen_enc);
6954       break;
6955     default:
6956       fatal("Unsupported type %s", type2name(elem_bt));
6957       break;
6958   }
6959 }
6960 
6961 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
6962   if (is_unsigned) {
6963     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6964   } else {
6965     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6966   }
6967 }
6968 
6969 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
6970   if (is_unsigned) {
6971     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6972   } else {
6973     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6974   }
6975 }
6976 
6977 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6978   switch(opcode) {
6979     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
6980     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
6981     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
6982     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
6983     default: assert(false, "%s", NodeClassNames[opcode]); break;
6984   }
6985 }
6986 
6987 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6988   switch(opcode) {
6989     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
6990     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
6991     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
6992     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
6993     default: assert(false, "%s", NodeClassNames[opcode]); break;
6994   }
6995 }
6996 
6997 void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6998                                             KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
6999   vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
7000 }
7001 
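     // Illustrative scalar sketch of the max flavour (not emitted code); the min flavour
     // below tests the sign of src1 instead and uses vminph:
     //   a = (src2 < 0) ? src2 : src1;   // xtmp1
     //   b = (src2 < 0) ? src1 : src2;   // xtmp2, the preferred operand for +/-0.0 ties
     //   r = maxph(a, b);                // returns b when both are zeros or exactly one input is a NaN
     //   return isNaN(a) ? a : r;        // lanes where a is a NaN are patched back from a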
7002 void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7003                                             KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
7004   if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
7005     // Move sign bits of src2 to mask register.
7006     evpmovw2m(ktmp, src2, vlen_enc);
7007     // xtmp1 = src2 < 0 ? src2 : src1
7008     evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7009     // xtmp2 = src2 < 0 ? src1 : src2
7010     evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
7011     // The idea behind the above swapping is to make the second source operand a non-negative value.
7012     // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
7013     // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
7014     // the second source operand, either a NaN or a valid floating-point value, is returned.
7015     // dst = max(xtmp1, xtmp2)
7016     evmaxph(dst, xtmp1, xtmp2, vlen_enc);
7017     // isNaN = is_unordered_quiet(xtmp1)
7018     evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
7019     // The final result equals the first source operand if it is a NaN; if instead the
7020     // second operand holds a NaN then, as per the above semantics, the result
7021     // already equals the second operand.
7022     Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7023   } else {
7024     assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
7025     // Move sign bits of src1 to mask register.
7026     evpmovw2m(ktmp, src1, vlen_enc);
7027     // xtmp1 = src1 < 0 ? src2 : src1
7028     evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7029     // xtmp2 = src1 < 0 ? src1 : src2
7030     evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
7031     // The idea behind the above swapping is to make the second source operand a negative value.
7032     // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
7033     // the second source operand is returned.
7034     // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
7035     // or a valid floating-point value, is written to the result.
7036     // dst = min(xtmp1, xtmp2)
7037     evminph(dst, xtmp1, xtmp2, vlen_enc);
7038     // isNaN = is_unordered_quiet(xtmp1)
7039     evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
7040     // The final result equals the first source operand if it is a NaN; if instead the
7041     // second operand holds a NaN then, as per the above semantics, the result
7042     // already equals the second operand.
7043     Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7044   }
7045 }