1 /*
   2  * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "gc/shared/barrierSet.hpp"
  28 #include "gc/shared/barrierSetAssembler.hpp"
  29 #include "oops/methodData.hpp"
  30 #include "opto/c2_MacroAssembler.hpp"
  31 #include "opto/intrinsicnode.hpp"
#include "opto/opcodes.hpp"
#include "opto/output.hpp"
  34 #include "opto/subnode.hpp"
  35 #include "runtime/globals.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 #include "utilities/checkedCast.hpp"
  39 #include "utilities/globalDefinitions.hpp"
  40 #include "utilities/powerOfTwo.hpp"
  41 #include "utilities/sizes.hpp"
  42 
  43 #ifdef PRODUCT
  44 #define BLOCK_COMMENT(str) /* nothing */
  45 #define STOP(error) stop(error)
  46 #else
  47 #define BLOCK_COMMENT(str) block_comment(str)
  48 #define STOP(error) block_comment(error); stop(error)
  49 #endif
  50 
  51 // C2 compiled method's prolog code.
  52 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  53 
  54   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  55   // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes; the frame
  // allocation can be either 3 or 6 bytes. So if we don't do a stack bang
  // then we must use the 6 byte frame allocation even if we have no frame. :-(
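  //
  // Rough sketch of the two prolog shapes emitted below (illustrative only;
  // the exact instructions depend on the flags checked in this routine,
  // PFP = PreserveFramePointer):
  //
  //   with stack bang:                  without stack bang:
  //     <stack overflow check>            sub  rsp, framesize   ; forced imm32 form
  //     push rbp                          mov  [rsp + off], rbp
  //     mov  rbp, rsp        ; if PFP     mov  rbp, rsp         ; if PFP
  //     sub  rsp, framesize               add  rbp, off         ; if PFP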
  60   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  61 
  62   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  63   // Remove word for return addr
  64   framesize -= wordSize;
  65   stack_bang_size -= wordSize;
  66 
  67   // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack, and the stack safety zone should account for that.
  71   // See bugs 4446381, 4468289, 4497237.
  72   if (stack_bang_size > 0) {
  73     generate_stack_overflow_check(stack_bang_size);
  74 
    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
  77     push(rbp);
  78     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  79     if (PreserveFramePointer) {
  80       mov(rbp, rsp);
  81     }
    // Remove word for rbp
  83     framesize -= wordSize;
  84 
  85     // Create frame
  86     if (framesize) {
  87       subptr(rsp, framesize);
  88     }
  89   } else {
  90     // Create frame (force generation of a 4 byte immediate value)
  91     subptr_imm32(rsp, framesize);
  92 
  93     // Save RBP register now.
  94     framesize -= wordSize;
  95     movptr(Address(rsp, framesize), rbp);
  96     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  97     if (PreserveFramePointer) {
  98       movptr(rbp, rsp);
  99       if (framesize > 0) {
 100         addptr(rbp, framesize);
 101       }
 102     }
 103   }
 104 
 105   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 106     framesize -= wordSize;
 107     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 108   }
 109 
 110 #ifdef ASSERT
 111   if (VerifyStackAtCalls) {
 112     Label L;
 113     push(rax);
 114     mov(rax, rsp);
 115     andptr(rax, StackAlignmentInBytes-1);
 116     cmpptr(rax, StackAlignmentInBytes-wordSize);
 117     pop(rax);
 118     jcc(Assembler::equal, L);
 119     STOP("Stack is not properly aligned!");
 120     bind(L);
 121   }
 122 #endif
 123 
 124   if (!is_stub) {
 125     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 126     // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 127     Label dummy_slow_path;
 128     Label dummy_continuation;
 129     Label* slow_path = &dummy_slow_path;
 130     Label* continuation = &dummy_continuation;
 131     if (!Compile::current()->output()->in_scratch_emit_size()) {
 132       // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 133       C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 134       Compile::current()->output()->add_stub(stub);
 135       slow_path = &stub->entry();
 136       continuation = &stub->continuation();
 137     }
 138     bs->nmethod_entry_barrier(this, slow_path, continuation);
 139   }
 140 }
 141 
 142 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 143   switch (vlen_in_bytes) {
 144     case  4: // fall-through
 145     case  8: // fall-through
 146     case 16: return Assembler::AVX_128bit;
 147     case 32: return Assembler::AVX_256bit;
 148     case 64: return Assembler::AVX_512bit;
 149 
 150     default: {
 151       ShouldNotReachHere();
 152       return Assembler::AVX_NoVec;
 153     }
 154   }
 155 }
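// For example, a 32-byte (256-bit) vector operand selects Assembler::AVX_256bit,
// while 4- and 8-byte sub-word vectors still use the AVX_128bit encoding.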
 156 
 157 // fast_lock and fast_unlock used by C2
 158 
 159 // Because the transitions from emitted code to the runtime
 160 // monitorenter/exit helper stubs are so slow it's critical that
 161 // we inline both the stack-locking fast path and the inflated fast path.
 162 //
 163 // See also: cmpFastLock and cmpFastUnlock.
 164 //
 165 // What follows is a specialized inline transliteration of the code
 166 // in enter() and exit(). If we're concerned about I$ bloat another
 167 // option would be to emit TrySlowEnter and TrySlowExit methods
 168 // at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 170 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 171 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 172 // In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
 177 //
 178 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
 179 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 180 // to those specialized methods.  That'd give us a mostly platform-independent
 181 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
 183 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 184 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 185 // (b) explicit barriers or fence operations.
 186 //
 187 // TODO:
 188 //
 189 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 190 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 191 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 192 //    the lock operators would typically be faster than reifying Self.
 193 //
 194 // *  Ideally I'd define the primitives as:
 195 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 196 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 197 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
 199 //    Furthermore the register assignments are overconstrained, possibly resulting in
 200 //    sub-optimal code near the synchronization site.
 201 //
 202 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 203 //    Alternately, use a better sp-proximity test.
 204 //
 205 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 206 //    Either one is sufficient to uniquely identify a thread.
 207 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 208 //
 209 // *  Intrinsify notify() and notifyAll() for the common cases where the
 210 //    object is locked by the calling thread but the waitlist is empty.
//    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 212 //
 213 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 214 //    But beware of excessive branch density on AMD Opterons.
 215 //
 216 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 217 //    or failure of the fast path.  If the fast path fails then we pass
 218 //    control to the slow path, typically in C.  In fast_lock and
 219 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 220 //    will emit a conditional branch immediately after the node.
 221 //    So we have branches to branches and lots of ICC.ZF games.
 222 //    Instead, it might be better to have C2 pass a "FailureLabel"
 223 //    into fast_lock and fast_unlock.  In the case of success, control
 224 //    will drop through the node.  ICC.ZF is undefined at exit.
 225 //    In the case of failure, the node will branch directly to the
//    FailureLabel.
 227 
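//
// For reference, a rough pseudo-code sketch of the ZF contract both fast paths
// implement (illustrative only; the emitted code below is authoritative):
//
//   fast_lock(obj, box):
//     if (lock acquired on the fast path)  ZF = 1   // C2 falls through
//     else                                 ZF = 0   // C2 branches to the slow path
//
//   fast_unlock(obj, box):
//     if (lock released on the fast path)  ZF = 1
//     else                                 ZF = 0   // slow path via a runtime call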
 228 
 229 // obj: object to lock
 230 // box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
 232 // scr: tmp -- KILLED
 233 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 234                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 235                                  Metadata* method_data) {
 236   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 237   // Ensure the register assignments are disjoint
 238   assert(tmpReg == rax, "");
 239   assert(cx1Reg == noreg, "");
 240   assert(cx2Reg == noreg, "");
 241   assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 242 
 243   // Possible cases that we'll encounter in fast_lock
 244   // ------------------------------------------------
 245   // * Inflated
 246   //    -- unlocked
 247   //    -- Locked
 248   //       = by self
 249   //       = by other
 250   // * neutral
 251   // * stack-locked
 252   //    -- by self
 253   //       = sp-proximity test hits
 254   //       = sp-proximity test generates false-negative
 255   //    -- by other
 256   //
 257 
 258   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 259 
 260   if (DiagnoseSyncOnValueBasedClasses != 0) {
 261     load_klass(tmpReg, objReg, scrReg);
 262     testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 263     jcc(Assembler::notZero, DONE_LABEL);
 264   }
 265 
 266   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 267   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 268   jcc(Assembler::notZero, IsInflated);
 269 
 270   if (LockingMode == LM_MONITOR) {
 271     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 272     testptr(objReg, objReg);
 273   } else {
 274     assert(LockingMode == LM_LEGACY, "must be");
 275     // Attempt stack-locking ...
 276     orptr (tmpReg, markWord::unlocked_value);
 277     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 278     lock();
 279     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 280     jcc(Assembler::equal, COUNT);           // Success
 281 
 282     // Recursive locking.
 283     // The object is stack-locked: markword contains stack pointer to BasicLock.
 284     // Locked by current thread if difference with current SP is less than one page.
 285     subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (success) if the difference is less than one page.
 287     andptr(tmpReg, (int32_t) (7 - (int)os::vm_page_size()) );
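    // Illustrative arithmetic, assuming a 4K page: 7 - 4096 == -4089 == 0x...fffff007,
    // so the AND leaves zero (ZF == 1) only when the displaced-header address is
    // word aligned and within one page above rsp, i.e. a plausible recursive
    // stack-lock owned by the current thread.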
 288     movptr(Address(boxReg, 0), tmpReg);
 289   }
 290   jmp(DONE_LABEL);
 291 
 292   bind(IsInflated);
 293   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 294 
 295   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 296   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 297   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 298 
 299   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 300   movptr(boxReg, Address(r15_thread, JavaThread::monitor_owner_id_offset()));
 301   movq(scrReg, tmpReg);
 302   xorq(tmpReg, tmpReg);
 303   lock();
 304   cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 305 
 306   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 307   jccb(Assembler::equal, COUNT);    // CAS above succeeded; propagate ZF = 1 (success)
 308 
 309   cmpptr(boxReg, rax);                // Check if we are already the owner (recursive lock)
 310   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 311   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 312   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 313   bind(DONE_LABEL);
 314 
 315   // ZFlag == 1 count in fast path
 316   // ZFlag == 0 count in slow path
 317   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 318 
 319   bind(COUNT);
 320   if (LockingMode == LM_LEGACY) {
 321     // Count monitors in fast path
 322     increment(Address(thread, JavaThread::held_monitor_count_offset()));
 323   }
 324   xorl(tmpReg, tmpReg); // Set ZF == 1
 325 
 326   bind(NO_COUNT);
 327 
 328   // At NO_COUNT the icc ZFlag is set as follows ...
 329   // fast_unlock uses the same protocol.
 330   // ZFlag == 1 -> Success
 331   // ZFlag == 0 -> Failure - force control through the slow path
 332 }
 333 
 334 // obj: object to unlock
 335 // box: box address (displaced header location), killed.  Must be EAX.
 336 // tmp: killed, cannot be obj nor box.
 337 //
 338 // Some commentary on balanced locking:
 339 //
 340 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 341 // Methods that don't have provably balanced locking are forced to run in the
 342 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 343 // The interpreter provides two properties:
 344 // I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
 346 //      interpreter maintains an on-stack list of locks currently held by
 347 //      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame, the interpreter throws IMSX.
 350 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 352 // B() doesn't have provably balanced locking so it runs in the interpreter.
 353 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 354 // is still locked by A().
 355 //
 356 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 357 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 358 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 359 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->_owner == Self check in unlock.
 363 // A perfectly viable alternative is to elide the owner check except when
 364 // Xcheck:jni is enabled.
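//
// Concretely (illustrative Java shape only):
//
//   void A() { synchronized (o) { B(); } }   // provably balanced -> compiled, uses fast_lock/fast_unlock
//   void B() { /* unbalanced monitorenter/monitorexit bytecode */ }  // rejected by C2 -> runs interpreted
//
// By I1 any monitors B() acquired are released when B() returns, and by I2 B()
// cannot have released A()'s lock on O, so A()'s fast_unlock always finds O
// still locked exactly as A() left it.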
 365 
 366 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
 367   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 368   assert(boxReg == rax, "");
 369   assert_different_registers(objReg, boxReg, tmpReg);
 370 
 371   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 372 
 373   if (LockingMode == LM_LEGACY) {
 374     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 375     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 376   }
 377   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 378   if (LockingMode != LM_MONITOR) {
 379     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 380     jcc(Assembler::zero, Stacked);
 381   }
 382 
 383   // It's inflated.
 384 
 385   // Despite our balanced locking property we still check that m->_owner == Self
 386   // as java routines or native JNI code called by this thread might
 387   // have released the lock.
 388   //
 389   // If there's no contention try a 1-0 exit.  That is, exit without
 390   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 391   // we detect and recover from the race that the 1-0 exit admits.
 392   //
 393   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 394   // before it STs null into _owner, releasing the lock.  Updates
 395   // to data protected by the critical section must be visible before
 396   // we drop the lock (and thus before any other thread could acquire
 397   // the lock and observe the fields protected by the lock).
  // x86's memory model (TSO) keeps stores ordered with respect to
 399   // each other and there's no need for an explicit barrier (fence).
 400   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
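  //
  // Rough sketch of the 1-0 exit implemented below (illustrative only):
  //
  //   if (m->_recursions != 0) { m->_recursions--; ZF = 1; done; }
  //   m->_owner = null;                  // release the lock
  //   StoreLoad fence;                   // then re-read the waiter state
  //   if (m->_entry_list == null || m->_succ != null) { ZF = 1; done; }
  //   // otherwise a waiter may be stranded: publish the monitor in the thread
  //   // and take the slow path with ZF = 0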
 401   Label LSuccess, LNotRecursive;
 402 
 403   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 404   jccb(Assembler::equal, LNotRecursive);
 405 
 406   // Recursive inflated unlock
 407   decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 408   jmpb(LSuccess);
 409 
 410   bind(LNotRecursive);
 411 
 412   // Set owner to null.
 413   // Release to satisfy the JMM
 414   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 415   // We need a full fence after clearing owner to avoid stranding.
 416   // StoreLoad achieves this.
 417   membar(StoreLoad);
 418 
 419   // Check if the entry_list is empty.
 420   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(entry_list)), NULL_WORD);
 421   jccb(Assembler::zero, LSuccess);    // If so we are done.
 422 
 423   // Check if there is a successor.
 424   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 425   jccb(Assembler::notZero, LSuccess); // If so we are done.
 426 
 427   // Save the monitor pointer in the current thread, so we can try to
 428   // reacquire the lock in SharedRuntime::monitor_exit_helper().
 429   andptr(tmpReg, ~(int32_t)markWord::monitor_value);
 430   movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);
 431 
 432   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 433   jmpb  (DONE_LABEL);
 434 
 435   bind  (LSuccess);
 436   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 437   jmpb  (DONE_LABEL);
 438 
 439   if (LockingMode == LM_LEGACY) {
 440     bind  (Stacked);
 441     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 442     lock();
 443     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 444     // Intentional fall-thru into DONE_LABEL
 445   }
 446 
 447   bind(DONE_LABEL);
 448 
 449   // ZFlag == 1 count in fast path
 450   // ZFlag == 0 count in slow path
 451   jccb(Assembler::notZero, NO_COUNT);
 452 
 453   bind(COUNT);
 454 
 455   if (LockingMode == LM_LEGACY) {
 456     // Count monitors in fast path
 457     decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 458   }
 459 
 460   xorl(tmpReg, tmpReg); // Set ZF == 1
 461 
 462   bind(NO_COUNT);
 463 }
 464 
 465 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 466                                               Register t, Register thread) {
 467   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 468   assert(rax_reg == rax, "Used for CAS");
 469   assert_different_registers(obj, box, rax_reg, t, thread);
 470 
 471   // Handle inflated monitor.
 472   Label inflated;
 473   // Finish fast lock successfully. ZF value is irrelevant.
 474   Label locked;
 475   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 476   Label slow_path;
 477 
 478   if (UseObjectMonitorTable) {
 479     // Clear cache in case fast locking succeeds.
 480     movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
 481   }
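  // Mark word lock bits relied on below (low two bits of the header, for reference):
  //   0b01 = unlocked (neutral), 0b00 = fast-locked, 0b10 = inflated monitor.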
 482 
 483   if (DiagnoseSyncOnValueBasedClasses != 0) {
 484     load_klass(rax_reg, obj, t);
 485     testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 486     jcc(Assembler::notZero, slow_path);
 487   }
 488 
 489   const Register mark = t;
 490 
 491   { // Lightweight Lock
 492 
 493     Label push;
 494 
 495     const Register top = UseObjectMonitorTable ? rax_reg : box;
 496 
 497     // Load the mark.
 498     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 499 
 500     // Prefetch top.
 501     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 502 
 503     // Check for monitor (0b10).
 504     testptr(mark, markWord::monitor_value);
 505     jcc(Assembler::notZero, inflated);
 506 
 507     // Check if lock-stack is full.
 508     cmpl(top, LockStack::end_offset() - 1);
 509     jcc(Assembler::greater, slow_path);
 510 
 511     // Check if recursive.
 512     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 513     jccb(Assembler::equal, push);
 514 
 515     // Try to lock. Transition lock bits 0b01 => 0b00
 516     movptr(rax_reg, mark);
 517     orptr(rax_reg, markWord::unlocked_value);
 518     andptr(mark, ~(int32_t)markWord::unlocked_value);
 519     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 520     jcc(Assembler::notEqual, slow_path);
 521 
 522     if (UseObjectMonitorTable) {
 523       // Need to reload top, clobbered by CAS.
 524       movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 525     }
 526     bind(push);
 527     // After successful lock, push object on lock-stack.
 528     movptr(Address(thread, top), obj);
 529     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 530     jmpb(locked);
 531   }
 532 
 533   { // Handle inflated monitor.
 534     bind(inflated);
 535 
 536     const Register monitor = t;
 537 
 538     if (!UseObjectMonitorTable) {
 539       assert(mark == monitor, "should be the same here");
 540     } else {
 541       // Uses ObjectMonitorTable.  Look for the monitor in the om_cache.
 542       // Fetch ObjectMonitor* from the cache or take the slow-path.
 543       Label monitor_found;
 544 
 545       // Load cache address
 546       lea(t, Address(thread, JavaThread::om_cache_oops_offset()));
 547 
 548       const int num_unrolled = 2;
 549       for (int i = 0; i < num_unrolled; i++) {
 550         cmpptr(obj, Address(t));
 551         jccb(Assembler::equal, monitor_found);
 552         increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 553       }
 554 
 555       Label loop;
 556 
 557       // Search for obj in cache.
 558       bind(loop);
 559 
 560       // Check for match.
 561       cmpptr(obj, Address(t));
 562       jccb(Assembler::equal, monitor_found);
 563 
 564       // Search until null encountered, guaranteed _null_sentinel at end.
 565       cmpptr(Address(t), 1);
 566       jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
 567       increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 568       jmpb(loop);
 569 
 570       // Cache hit.
 571       bind(monitor_found);
 572       movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
 573     }
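    // Note: the search above relies on the OMCache being laid out as consecutive
    // (oop, ObjectMonitor*) pairs terminated by a null-oop sentinel, which is why
    // a miss is guaranteed to fall out at the sentinel and take the slow path.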
 574     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 575     const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 576     const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);
 577 
 578     Label monitor_locked;
 579     // Lock the monitor.
 580 
 581     if (UseObjectMonitorTable) {
 582       // Cache the monitor for unlock before trashing box. On failure to acquire
 583       // the lock, the slow path will reset the entry accordingly (see CacheSetter).
 584       movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
 585     }
 586 
 587     // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 588     xorptr(rax_reg, rax_reg);
 589     movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
 590     lock(); cmpxchgptr(box, owner_address);
 591     jccb(Assembler::equal, monitor_locked);
 592 
 593     // Check if recursive.
 594     cmpptr(box, rax_reg);
 595     jccb(Assembler::notEqual, slow_path);
 596 
 597     // Recursive.
 598     increment(recursions_address);
 599 
 600     bind(monitor_locked);
 601   }
 602 
 603   bind(locked);
 604   // Set ZF = 1
 605   xorl(rax_reg, rax_reg);
 606 
 607 #ifdef ASSERT
 608   // Check that locked label is reached with ZF set.
 609   Label zf_correct;
 610   Label zf_bad_zero;
 611   jcc(Assembler::zero, zf_correct);
 612   jmp(zf_bad_zero);
 613 #endif
 614 
 615   bind(slow_path);
 616 #ifdef ASSERT
 617   // Check that slow_path label is reached with ZF not set.
 618   jcc(Assembler::notZero, zf_correct);
 619   stop("Fast Lock ZF != 0");
 620   bind(zf_bad_zero);
 621   stop("Fast Lock ZF != 1");
 622   bind(zf_correct);
 623 #endif
 624   // C2 uses the value of ZF to determine the continuation.
 625 }
 626 
 627 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
 628   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 629   assert(reg_rax == rax, "Used for CAS");
 630   assert_different_registers(obj, reg_rax, t);
 631 
 632   // Handle inflated monitor.
 633   Label inflated, inflated_check_lock_stack;
 634   // Finish fast unlock successfully.  MUST jump with ZF == 1
 635   Label unlocked, slow_path;
 636 
 637   const Register mark = t;
 638   const Register monitor = t;
 639   const Register top = UseObjectMonitorTable ? t : reg_rax;
 640   const Register box = reg_rax;
 641 
 642   Label dummy;
 643   C2FastUnlockLightweightStub* stub = nullptr;
 644 
 645   if (!Compile::current()->output()->in_scratch_emit_size()) {
 646     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
 647     Compile::current()->output()->add_stub(stub);
 648   }
 649 
 650   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
 651 
 652   { // Lightweight Unlock
 653 
 654     // Load top.
 655     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 656 
 657     if (!UseObjectMonitorTable) {
 658       // Prefetch mark.
 659       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 660     }
 661 
 662     // Check if obj is top of lock-stack.
 663     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 664     // Top of lock stack was not obj. Must be monitor.
 665     jcc(Assembler::notEqual, inflated_check_lock_stack);
 666 
 667     // Pop lock-stack.
 668     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
 669     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 670 
 671     // Check if recursive.
 672     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
 673     jcc(Assembler::equal, unlocked);
 674 
 675     // We elide the monitor check, let the CAS fail instead.
 676 
 677     if (UseObjectMonitorTable) {
 678       // Load mark.
 679       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 680     }
 681 
 682     // Try to unlock. Transition lock bits 0b00 => 0b01
 683     movptr(reg_rax, mark);
 684     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
 685     orptr(mark, markWord::unlocked_value);
 686     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 687     jcc(Assembler::notEqual, push_and_slow_path);
 688     jmp(unlocked);
 689   }
 690 
 691 
 692   { // Handle inflated monitor.
 693     bind(inflated_check_lock_stack);
 694 #ifdef ASSERT
 695     Label check_done;
 696     subl(top, oopSize);
 697     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
 698     jcc(Assembler::below, check_done);
 699     cmpptr(obj, Address(thread, top));
 700     jccb(Assembler::notEqual, inflated_check_lock_stack);
 701     stop("Fast Unlock lock on stack");
 702     bind(check_done);
 703     if (UseObjectMonitorTable) {
 704       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 705     }
 706     testptr(mark, markWord::monitor_value);
 707     jccb(Assembler::notZero, inflated);
 708     stop("Fast Unlock not monitor");
 709 #endif
 710 
 711     bind(inflated);
 712 
 713     if (!UseObjectMonitorTable) {
 714       assert(mark == monitor, "should be the same here");
 715     } else {
 716       // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
 717       movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 718       // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
 719       cmpptr(monitor, alignof(ObjectMonitor*));
 720       jcc(Assembler::below, slow_path);
 721     }
 722     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 723     const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
 724     const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
 725     const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
 726     const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
 727 
 728     Label recursive;
 729 
 730     // Check if recursive.
 731     cmpptr(recursions_address, 0);
 732     jccb(Assembler::notZero, recursive);
 733 
 734     // Set owner to null.
 735     // Release to satisfy the JMM
 736     movptr(owner_address, NULL_WORD);
 737     // We need a full fence after clearing owner to avoid stranding.
 738     // StoreLoad achieves this.
 739     membar(StoreLoad);
 740 
 741     // Check if the entry_list is empty.
 742     cmpptr(entry_list_address, NULL_WORD);
 743     jccb(Assembler::zero, unlocked);    // If so we are done.
 744 
 745     // Check if there is a successor.
 746     cmpptr(succ_address, NULL_WORD);
 747     jccb(Assembler::notZero, unlocked); // If so we are done.
 748 
 749     // Save the monitor pointer in the current thread, so we can try to
 750     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 751     if (!UseObjectMonitorTable) {
 752       andptr(monitor, ~(int32_t)markWord::monitor_value);
 753     }
 754     movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);
 755 
 756     orl(t, 1); // Fast Unlock ZF = 0
 757     jmpb(slow_path);
 758 
 759     // Recursive unlock.
 760     bind(recursive);
 761     decrement(recursions_address);
 762   }
 763 
 764   bind(unlocked);
 765   xorl(t, t); // Fast Unlock ZF = 1
 766 
 767 #ifdef ASSERT
 768   // Check that unlocked label is reached with ZF set.
 769   Label zf_correct;
 770   Label zf_bad_zero;
 771   jcc(Assembler::zero, zf_correct);
 772   jmp(zf_bad_zero);
 773 #endif
 774 
 775   bind(slow_path);
 776   if (stub != nullptr) {
 777     bind(stub->slow_path_continuation());
 778   }
 779 #ifdef ASSERT
 780   // Check that stub->continuation() label is reached with ZF not set.
 781   jcc(Assembler::notZero, zf_correct);
 782   stop("Fast Unlock ZF != 0");
 783   bind(zf_bad_zero);
 784   stop("Fast Unlock ZF != 1");
 785   bind(zf_correct);
 786 #endif
 787   // C2 uses the value of ZF to determine the continuation.
 788 }
 789 
 790 //-------------------------------------------------------------------------------------------
 791 // Generic instructions support for use in .ad files C2 code generation
 792 
 793 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 794   if (dst != src) {
 795     movdqu(dst, src);
 796   }
 797   if (opcode == Op_AbsVD) {
 798     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 799   } else {
    assert(opcode == Op_NegVD, "opcode should be Op_NegVD");
 801     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 802   }
 803 }
 804 
 805 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 806   if (opcode == Op_AbsVD) {
 807     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 808   } else {
    assert(opcode == Op_NegVD, "opcode should be Op_NegVD");
 810     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 811   }
 812 }
 813 
 814 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 815   if (dst != src) {
 816     movdqu(dst, src);
 817   }
 818   if (opcode == Op_AbsVF) {
 819     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 820   } else {
    assert(opcode == Op_NegVF, "opcode should be Op_NegVF");
 822     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 823   }
 824 }
 825 
 826 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 827   if (opcode == Op_AbsVF) {
 828     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 829   } else {
    assert(opcode == Op_NegVF, "opcode should be Op_NegVF");
 831     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 832   }
 833 }
 834 
 835 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 836   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 837   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 838 
 839   if (opcode == Op_MinV) {
 840     if (elem_bt == T_BYTE) {
 841       pminsb(dst, src);
 842     } else if (elem_bt == T_SHORT) {
 843       pminsw(dst, src);
 844     } else if (elem_bt == T_INT) {
 845       pminsd(dst, src);
 846     } else {
 847       assert(elem_bt == T_LONG, "required");
 848       assert(tmp == xmm0, "required");
 849       assert_different_registers(dst, src, tmp);
 850       movdqu(xmm0, dst);
 851       pcmpgtq(xmm0, src);
 852       blendvpd(dst, src);  // xmm0 as mask
 853     }
 854   } else { // opcode == Op_MaxV
 855     if (elem_bt == T_BYTE) {
 856       pmaxsb(dst, src);
 857     } else if (elem_bt == T_SHORT) {
 858       pmaxsw(dst, src);
 859     } else if (elem_bt == T_INT) {
 860       pmaxsd(dst, src);
 861     } else {
 862       assert(elem_bt == T_LONG, "required");
 863       assert(tmp == xmm0, "required");
 864       assert_different_registers(dst, src, tmp);
 865       movdqu(xmm0, src);
 866       pcmpgtq(xmm0, dst);
 867       blendvpd(dst, src);  // xmm0 as mask
 868     }
 869   }
 870 }
 871 
 872 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 873                                   XMMRegister src1, Address src2, int vlen_enc) {
 874   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 875   if (opcode == Op_UMinV) {
 876     switch(elem_bt) {
 877       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 878       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 879       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 880       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 881       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 882     }
 883   } else {
 884     assert(opcode == Op_UMaxV, "required");
 885     switch(elem_bt) {
 886       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 887       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 888       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 889       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 890       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 891     }
 892   }
 893 }
 894 
 895 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
 896   // For optimality, leverage a full vector width of 512 bits
 897   // for operations over smaller vector sizes on AVX512 targets.
 898   if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 899     if (opcode == Op_UMaxV) {
 900       evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 901     } else {
 902       assert(opcode == Op_UMinV, "required");
 903       evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 904     }
 905   } else {
 906     // T1 = -1
 907     vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
 908     // T1 = -1 << 63
 909     vpsllq(xtmp1, xtmp1, 63, vlen_enc);
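    // Adding T1 (1 << 63) flips each element's sign bit (mod 2^64), mapping the
    // unsigned order onto the signed order (0 becomes LONG_MIN, 2^64-1 becomes
    // LONG_MAX), so the signed vpcmpgtq below computes the unsigned comparison.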
 910     // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
 911     vpaddq(xtmp2, xtmp1, src2, vlen_enc);
 912     // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
 913     vpaddq(xtmp1, xtmp1, src1, vlen_enc);
 914     // Mask = T2 > T1
 915     vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
 916     if (opcode == Op_UMaxV) {
 917       // Res = Mask ? Src2 : Src1
 918       vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
 919     } else {
 920       // Res = Mask ? Src1 : Src2
 921       vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
 922     }
 923   }
 924 }
 925 
 926 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 927                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
 928   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 929   if (opcode == Op_UMinV) {
 930     switch(elem_bt) {
 931       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 932       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 933       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 934       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 935       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 936     }
 937   } else {
 938     assert(opcode == Op_UMaxV, "required");
 939     switch(elem_bt) {
 940       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 941       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 942       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 943       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 944       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 945     }
 946   }
 947 }
 948 
 949 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 950                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 951                                  int vlen_enc) {
 952   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 953 
 954   if (opcode == Op_MinV) {
 955     if (elem_bt == T_BYTE) {
 956       vpminsb(dst, src1, src2, vlen_enc);
 957     } else if (elem_bt == T_SHORT) {
 958       vpminsw(dst, src1, src2, vlen_enc);
 959     } else if (elem_bt == T_INT) {
 960       vpminsd(dst, src1, src2, vlen_enc);
 961     } else {
 962       assert(elem_bt == T_LONG, "required");
 963       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 964         vpminsq(dst, src1, src2, vlen_enc);
 965       } else {
 966         assert_different_registers(dst, src1, src2);
 967         vpcmpgtq(dst, src1, src2, vlen_enc);
 968         vblendvpd(dst, src1, src2, dst, vlen_enc);
 969       }
 970     }
 971   } else { // opcode == Op_MaxV
 972     if (elem_bt == T_BYTE) {
 973       vpmaxsb(dst, src1, src2, vlen_enc);
 974     } else if (elem_bt == T_SHORT) {
 975       vpmaxsw(dst, src1, src2, vlen_enc);
 976     } else if (elem_bt == T_INT) {
 977       vpmaxsd(dst, src1, src2, vlen_enc);
 978     } else {
 979       assert(elem_bt == T_LONG, "required");
 980       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 981         vpmaxsq(dst, src1, src2, vlen_enc);
 982       } else {
 983         assert_different_registers(dst, src1, src2);
 984         vpcmpgtq(dst, src1, src2, vlen_enc);
 985         vblendvpd(dst, src2, src1, dst, vlen_enc);
 986       }
 987     }
 988   }
 989 }
 990 
 991 // Float/Double min max
 992 
 993 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
 994                                    XMMRegister dst, XMMRegister a, XMMRegister b,
 995                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
 996                                    int vlen_enc) {
 997   assert(UseAVX > 0, "required");
 998   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
 999          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1000   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1001   assert_different_registers(a, tmp, atmp, btmp);
1002   assert_different_registers(b, tmp, atmp, btmp);
1003 
1004   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1005   bool is_double_word = is_double_word_type(elem_bt);
1006 
1007   /* Note on 'non-obvious' assembly sequence:
1008    *
1009    * While there are vminps/vmaxps instructions, there are two important differences between hardware
1010    * and Java on how they handle floats:
1011    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
1013    *
1014    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
1015    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
1016    *                (only useful when signs differ, noop otherwise)
1017    *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   *  The following pseudo code describes the algorithm for max[FD] (the min algorithm is along similar lines):
1020    *   btmp = (b < +0.0) ? a : b
1021    *   atmp = (b < +0.0) ? b : a
1022    *   Tmp  = Max_Float(atmp , btmp)
1023    *   Res  = (atmp == NaN) ? atmp : Tmp
1024    */
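  // Worked example of the mismatch being corrected (illustrative): Java's
  // Math.min(-0.0f, +0.0f) must return -0.0f, but vminps returns its second
  // parameter when the inputs compare equal, so it can hand back +0.0f; and
  // Math.min(NaN, 1.0f) must be NaN, yet vminps(NaN, 1.0f) returns 1.0f.
  // The blend/cmp steps below restore the Java semantics in both cases.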
1025 
1026   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
1027   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
1028   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
1029   XMMRegister mask;
1030 
1031   if (!is_double_word && is_min) {
1032     mask = a;
1033     vblend = &MacroAssembler::vblendvps;
1034     vmaxmin = &MacroAssembler::vminps;
1035     vcmp = &MacroAssembler::vcmpps;
1036   } else if (!is_double_word && !is_min) {
1037     mask = b;
1038     vblend = &MacroAssembler::vblendvps;
1039     vmaxmin = &MacroAssembler::vmaxps;
1040     vcmp = &MacroAssembler::vcmpps;
1041   } else if (is_double_word && is_min) {
1042     mask = a;
1043     vblend = &MacroAssembler::vblendvpd;
1044     vmaxmin = &MacroAssembler::vminpd;
1045     vcmp = &MacroAssembler::vcmppd;
1046   } else {
1047     assert(is_double_word && !is_min, "sanity");
1048     mask = b;
1049     vblend = &MacroAssembler::vblendvpd;
1050     vmaxmin = &MacroAssembler::vmaxpd;
1051     vcmp = &MacroAssembler::vcmppd;
1052   }
1053 
1054   // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
1055   XMMRegister maxmin, scratch;
1056   if (dst == btmp) {
1057     maxmin = btmp;
1058     scratch = tmp;
1059   } else {
1060     maxmin = tmp;
1061     scratch = btmp;
1062   }
1063 
1064   bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
1065   if (precompute_mask && !is_double_word) {
1066     vpsrad(tmp, mask, 32, vlen_enc);
1067     mask = tmp;
1068   } else if (precompute_mask && is_double_word) {
1069     vpxor(tmp, tmp, tmp, vlen_enc);
1070     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1071     mask = tmp;
1072   }
1073 
1074   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1075   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1076   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1077   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1078   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1079 }
1080 
1081 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1082                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1083                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1084                                     int vlen_enc) {
1085   assert(UseAVX > 2, "required");
1086   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1087          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1088   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1089   assert_different_registers(dst, a, atmp, btmp);
1090   assert_different_registers(dst, b, atmp, btmp);
1091 
1092   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1093   bool is_double_word = is_double_word_type(elem_bt);
1094   bool merge = true;
1095 
1096   if (!is_double_word && is_min) {
1097     evpmovd2m(ktmp, a, vlen_enc);
1098     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1099     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1100     vminps(dst, atmp, btmp, vlen_enc);
1101     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1102     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1103   } else if (!is_double_word && !is_min) {
1104     evpmovd2m(ktmp, b, vlen_enc);
1105     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1106     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1107     vmaxps(dst, atmp, btmp, vlen_enc);
1108     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1109     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1110   } else if (is_double_word && is_min) {
1111     evpmovq2m(ktmp, a, vlen_enc);
1112     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1113     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1114     vminpd(dst, atmp, btmp, vlen_enc);
1115     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1116     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1117   } else {
1118     assert(is_double_word && !is_min, "sanity");
1119     evpmovq2m(ktmp, b, vlen_enc);
1120     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1121     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1122     vmaxpd(dst, atmp, btmp, vlen_enc);
1123     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1124     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1125   }
1126 }
1127 
1128 // Float/Double signum
1129 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1130   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1131 
1132   Label DONE_LABEL;
1133 
1134   if (opcode == Op_SignumF) {
1135     assert(UseSSE > 0, "required");
1136     ucomiss(dst, zero);
1137     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1138     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1139     movflt(dst, one);
1140     jcc(Assembler::above, DONE_LABEL);
1141     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1142   } else if (opcode == Op_SignumD) {
1143     assert(UseSSE > 1, "required");
1144     ucomisd(dst, zero);
1145     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1146     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1147     movdbl(dst, one);
1148     jcc(Assembler::above, DONE_LABEL);
1149     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1150   }
1151 
1152   bind(DONE_LABEL);
1153 }
1154 
1155 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1156   if (sign) {
1157     pmovsxbw(dst, src);
1158   } else {
1159     pmovzxbw(dst, src);
1160   }
1161 }
1162 
1163 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1164   if (sign) {
1165     vpmovsxbw(dst, src, vector_len);
1166   } else {
1167     vpmovzxbw(dst, src, vector_len);
1168   }
1169 }
1170 
1171 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1172   if (sign) {
1173     vpmovsxbd(dst, src, vector_len);
1174   } else {
1175     vpmovzxbd(dst, src, vector_len);
1176   }
1177 }
1178 
1179 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1180   if (sign) {
1181     vpmovsxwd(dst, src, vector_len);
1182   } else {
1183     vpmovzxwd(dst, src, vector_len);
1184   }
1185 }
1186 
1187 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1188                                      int shift, int vector_len) {
1189   if (opcode == Op_RotateLeftV) {
1190     if (etype == T_INT) {
1191       evprold(dst, src, shift, vector_len);
1192     } else {
1193       assert(etype == T_LONG, "expected type T_LONG");
1194       evprolq(dst, src, shift, vector_len);
1195     }
1196   } else {
1197     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1198     if (etype == T_INT) {
1199       evprord(dst, src, shift, vector_len);
1200     } else {
1201       assert(etype == T_LONG, "expected type T_LONG");
1202       evprorq(dst, src, shift, vector_len);
1203     }
1204   }
1205 }
1206 
1207 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1208                                      XMMRegister shift, int vector_len) {
1209   if (opcode == Op_RotateLeftV) {
1210     if (etype == T_INT) {
1211       evprolvd(dst, src, shift, vector_len);
1212     } else {
1213       assert(etype == T_LONG, "expected type T_LONG");
1214       evprolvq(dst, src, shift, vector_len);
1215     }
1216   } else {
1217     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1218     if (etype == T_INT) {
1219       evprorvd(dst, src, shift, vector_len);
1220     } else {
1221       assert(etype == T_LONG, "expected type T_LONG");
1222       evprorvq(dst, src, shift, vector_len);
1223     }
1224   }
1225 }
1226 
1227 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1228   if (opcode == Op_RShiftVI) {
1229     psrad(dst, shift);
1230   } else if (opcode == Op_LShiftVI) {
1231     pslld(dst, shift);
1232   } else {
1233     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1234     psrld(dst, shift);
1235   }
1236 }
1237 
1238 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1239   switch (opcode) {
1240     case Op_RShiftVI:  psrad(dst, shift); break;
1241     case Op_LShiftVI:  pslld(dst, shift); break;
1242     case Op_URShiftVI: psrld(dst, shift); break;
1243 
1244     default: assert(false, "%s", NodeClassNames[opcode]);
1245   }
1246 }
1247 
1248 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1249   if (opcode == Op_RShiftVI) {
1250     vpsrad(dst, nds, shift, vector_len);
1251   } else if (opcode == Op_LShiftVI) {
1252     vpslld(dst, nds, shift, vector_len);
1253   } else {
1254     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1255     vpsrld(dst, nds, shift, vector_len);
1256   }
1257 }
1258 
1259 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1260   switch (opcode) {
1261     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1262     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1263     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1264 
1265     default: assert(false, "%s", NodeClassNames[opcode]);
1266   }
1267 }
1268 
1269 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1270   switch (opcode) {
1271     case Op_RShiftVB:  // fall-through
1272     case Op_RShiftVS:  psraw(dst, shift); break;
1273 
1274     case Op_LShiftVB:  // fall-through
1275     case Op_LShiftVS:  psllw(dst, shift);   break;
1276 
1277     case Op_URShiftVS: // fall-through
1278     case Op_URShiftVB: psrlw(dst, shift);  break;
1279 
1280     default: assert(false, "%s", NodeClassNames[opcode]);
1281   }
1282 }
1283 
1284 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1285   switch (opcode) {
1286     case Op_RShiftVB:  // fall-through
1287     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1288 
1289     case Op_LShiftVB:  // fall-through
1290     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1291 
1292     case Op_URShiftVS: // fall-through
1293     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1294 
1295     default: assert(false, "%s", NodeClassNames[opcode]);
1296   }
1297 }
1298 
1299 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1300   switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1302     case Op_LShiftVL:  psllq(dst, shift); break;
1303     case Op_URShiftVL: psrlq(dst, shift); break;
1304 
1305     default: assert(false, "%s", NodeClassNames[opcode]);
1306   }
1307 }
1308 
1309 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1310   if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1312   } else if (opcode == Op_LShiftVL) {
1313     psllq(dst, shift);
1314   } else {
1315     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1316     psrlq(dst, shift);
1317   }
1318 }
1319 
1320 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1321   switch (opcode) {
1322     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1323     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1324     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1325 
1326     default: assert(false, "%s", NodeClassNames[opcode]);
1327   }
1328 }
1329 
1330 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1331   if (opcode == Op_RShiftVL) {
1332     evpsraq(dst, nds, shift, vector_len);
1333   } else if (opcode == Op_LShiftVL) {
1334     vpsllq(dst, nds, shift, vector_len);
1335   } else {
1336     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1337     vpsrlq(dst, nds, shift, vector_len);
1338   }
1339 }
1340 
1341 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1342   switch (opcode) {
1343     case Op_RShiftVB:  // fall-through
1344     case Op_RShiftVS:  // fall-through
1345     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1346 
1347     case Op_LShiftVB:  // fall-through
1348     case Op_LShiftVS:  // fall-through
1349     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1350 
1351     case Op_URShiftVB: // fall-through
1352     case Op_URShiftVS: // fall-through
1353     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1354 
1355     default: assert(false, "%s", NodeClassNames[opcode]);
1356   }
1357 }
1358 
1359 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1360   switch (opcode) {
1361     case Op_RShiftVB:  // fall-through
1362     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1363 
1364     case Op_LShiftVB:  // fall-through
1365     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1366 
1367     case Op_URShiftVB: // fall-through
1368     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1369 
1370     default: assert(false, "%s", NodeClassNames[opcode]);
1371   }
1372 }
1373 
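// Variable 64-bit shifts. Pre-AVX512 hardware has no variable arithmetic right shift for
// quad-words, so Op_RShiftVL is emulated below with logical shifts using the identity
// (x >>s n) == ((x >>u n) ^ m) - m, where m == (sign_mask >>u n).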
1374 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1375   assert(UseAVX >= 2, "required");
1376   switch (opcode) {
1377     case Op_RShiftVL: {
1378       if (UseAVX > 2) {
1379         assert(tmp == xnoreg, "not used");
1380         if (!VM_Version::supports_avx512vl()) {
1381           vlen_enc = Assembler::AVX_512bit;
1382         }
1383         evpsravq(dst, src, shift, vlen_enc);
1384       } else {
1385         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1386         vpsrlvq(dst, src, shift, vlen_enc);
1387         vpsrlvq(tmp, tmp, shift, vlen_enc);
1388         vpxor(dst, dst, tmp, vlen_enc);
1389         vpsubq(dst, dst, tmp, vlen_enc);
1390       }
1391       break;
1392     }
1393     case Op_LShiftVL: {
1394       assert(tmp == xnoreg, "not used");
1395       vpsllvq(dst, src, shift, vlen_enc);
1396       break;
1397     }
1398     case Op_URShiftVL: {
1399       assert(tmp == xnoreg, "not used");
1400       vpsrlvq(dst, src, shift, vlen_enc);
1401       break;
1402     }
1403     default: assert(false, "%s", NodeClassNames[opcode]);
1404   }
1405 }
1406 
1407 // Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1408 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1409   assert(opcode == Op_LShiftVB ||
1410          opcode == Op_RShiftVB ||
1411          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1412   bool sign = (opcode != Op_URShiftVB);
1413   assert(vector_len == 0, "required");
1414   vextendbd(sign, dst, src, 1);
1415   vpmovzxbd(vtmp, shift, 1);
1416   varshiftd(opcode, dst, dst, vtmp, 1);
1417   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1418   vextracti128_high(vtmp, dst);
1419   vpackusdw(dst, dst, vtmp, 0);
1420 }
1421 
1422 // Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1423 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1424   assert(opcode == Op_LShiftVB ||
1425          opcode == Op_RShiftVB ||
1426          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1427   bool sign = (opcode != Op_URShiftVB);
1428   int ext_vector_len = vector_len + 1;
1429   vextendbw(sign, dst, src, ext_vector_len);
1430   vpmovzxbw(vtmp, shift, ext_vector_len);
1431   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1432   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1433   if (vector_len == 0) {
1434     vextracti128_high(vtmp, dst);
1435     vpackuswb(dst, dst, vtmp, vector_len);
1436   } else {
1437     vextracti64x4_high(vtmp, dst);
1438     vpackuswb(dst, dst, vtmp, vector_len);
1439     vpermq(dst, dst, 0xD8, vector_len);
1440   }
1441 }
1442 
1443 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1444   switch(typ) {
1445     case T_BYTE:
1446       pinsrb(dst, val, idx);
1447       break;
1448     case T_SHORT:
1449       pinsrw(dst, val, idx);
1450       break;
1451     case T_INT:
1452       pinsrd(dst, val, idx);
1453       break;
1454     case T_LONG:
1455       pinsrq(dst, val, idx);
1456       break;
1457     default:
1458       assert(false,"Should not reach here.");
1459       break;
1460   }
1461 }
1462 
1463 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1464   switch(typ) {
1465     case T_BYTE:
1466       vpinsrb(dst, src, val, idx);
1467       break;
1468     case T_SHORT:
1469       vpinsrw(dst, src, val, idx);
1470       break;
1471     case T_INT:
1472       vpinsrd(dst, src, val, idx);
1473       break;
1474     case T_LONG:
1475       vpinsrq(dst, src, val, idx);
1476       break;
1477     default:
1478       assert(false,"Should not reach here.");
1479       break;
1480   }
1481 }
1482 
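// Gather eight bytes (or four shorts) one element at a time under the control of a scalar
// bit mask: lanes whose mask bit is clear are left as zero in dst. mask_idx tracks the
// current bit position and is advanced for every lane, whether it is loaded or not.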
1483 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
1484                                                 XMMRegister dst, Register base,
1485                                                 Register idx_base,
1486                                                 Register offset, Register mask,
1487                                                 Register mask_idx, Register rtmp,
1488                                                 int vlen_enc) {
1489   vpxor(dst, dst, dst, vlen_enc);
1490   if (elem_bt == T_SHORT) {
1491     for (int i = 0; i < 4; i++) {
1492       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1493       Label skip_load;
1494       btq(mask, mask_idx);
1495       jccb(Assembler::carryClear, skip_load);
1496       movl(rtmp, Address(idx_base, i * 4));
1497       if (offset != noreg) {
1498         addl(rtmp, offset);
1499       }
1500       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1501       bind(skip_load);
1502       incq(mask_idx);
1503     }
1504   } else {
1505     assert(elem_bt == T_BYTE, "");
1506     for (int i = 0; i < 8; i++) {
1507       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1508       Label skip_load;
1509       btq(mask, mask_idx);
1510       jccb(Assembler::carryClear, skip_load);
1511       movl(rtmp, Address(idx_base, i * 4));
1512       if (offset != noreg) {
1513         addl(rtmp, offset);
1514       }
1515       pinsrb(dst, Address(base, rtmp), i);
1516       bind(skip_load);
1517       incq(mask_idx);
1518     }
1519   }
1520 }
1521 
1522 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
1523                                          Register base, Register idx_base,
1524                                          Register offset, Register rtmp,
1525                                          int vlen_enc) {
1526   vpxor(dst, dst, dst, vlen_enc);
1527   if (elem_bt == T_SHORT) {
1528     for (int i = 0; i < 4; i++) {
1529       // dst[i] = src[offset + idx_base[i]]
1530       movl(rtmp, Address(idx_base, i * 4));
1531       if (offset != noreg) {
1532         addl(rtmp, offset);
1533       }
1534       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1535     }
1536   } else {
1537     assert(elem_bt == T_BYTE, "");
1538     for (int i = 0; i < 8; i++) {
1539       // dst[i] = src[offset + idx_base[i]]
1540       movl(rtmp, Address(idx_base, i * 4));
1541       if (offset != noreg) {
1542         addl(rtmp, offset);
1543       }
1544       pinsrb(dst, Address(base, rtmp), i);
1545     }
1546   }
1547 }
1548 
1549 /*
1550  * Gather using a hybrid algorithm: first, a partially unrolled scalar loop
1551  * accumulates values from the gather indices into a quad-word (64-bit) slice.
1552  * A slice may hold 8 byte values or 4 short values. This is followed by a vector
1553  * permutation to place the slice into the appropriate vector lane
1554  * locations in the destination vector. The following pseudo code describes the
1555  * algorithm in detail:
1556  *
1557  * DST_VEC = ZERO_VEC
1558  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1559  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1560  * FOREACH_ITER:
1561  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1562  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1563  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1564  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1565  *
1566  * With each iteration, the doubleword permute indices (0, 1) corresponding
1567  * to the gathered quad-word are right shifted by two lane positions.
1568  *
1569  */
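// Example: for T_SHORT with vector_len == 16 (a 256-bit destination), each iteration
// gathers four short values into one 64-bit slice, so the loop below runs four times.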
1570 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1571                                         Register base, Register idx_base,
1572                                         Register offset, Register mask,
1573                                         XMMRegister xtmp1, XMMRegister xtmp2,
1574                                         XMMRegister temp_dst, Register rtmp,
1575                                         Register mask_idx, Register length,
1576                                         int vector_len, int vlen_enc) {
1577   Label GATHER8_LOOP;
1578   assert(is_subword_type(elem_ty), "");
1579   movl(length, vector_len);
1580   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1581   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1582   vallones(xtmp2, vlen_enc);
1583   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1584   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1585   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1586 
1587   bind(GATHER8_LOOP);
1588     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1589     if (mask == noreg) {
1590       vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc);
1591     } else {
1592       vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc);
1593     }
1594     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1595     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1596     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1597     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1598     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1599     vpor(dst, dst, temp_dst, vlen_enc);
1600     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
1601     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1602     jcc(Assembler::notEqual, GATHER8_LOOP);
1603 }
1604 
1605 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1606   switch(typ) {
1607     case T_INT:
1608       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1609       break;
1610     case T_FLOAT:
1611       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1612       break;
1613     case T_LONG:
1614       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1615       break;
1616     case T_DOUBLE:
1617       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1618       break;
1619     default:
1620       assert(false,"Should not reach here.");
1621       break;
1622   }
1623 }
1624 
1625 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1626   switch(typ) {
1627     case T_INT:
1628       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1629       break;
1630     case T_FLOAT:
1631       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1632       break;
1633     case T_LONG:
1634       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1635       break;
1636     case T_DOUBLE:
1637       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1638       break;
1639     default:
1640       assert(false,"Should not reach here.");
1641       break;
1642   }
1643 }
1644 
1645 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1646   switch(typ) {
1647     case T_INT:
1648       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1649       break;
1650     case T_FLOAT:
1651       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1652       break;
1653     case T_LONG:
1654       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1655       break;
1656     case T_DOUBLE:
1657       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1658       break;
1659     default:
1660       assert(false,"Should not reach here.");
1661       break;
1662   }
1663 }
1664 
1665 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1666   if (vlen_in_bytes <= 16) {
1667     pxor (dst, dst);
1668     psubb(dst, src);
1669     switch (elem_bt) {
1670       case T_BYTE:   /* nothing to do */ break;
1671       case T_SHORT:  pmovsxbw(dst, dst); break;
1672       case T_INT:    pmovsxbd(dst, dst); break;
1673       case T_FLOAT:  pmovsxbd(dst, dst); break;
1674       case T_LONG:   pmovsxbq(dst, dst); break;
1675       case T_DOUBLE: pmovsxbq(dst, dst); break;
1676 
1677       default: assert(false, "%s", type2name(elem_bt));
1678     }
1679   } else {
1680     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1681     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1682 
1683     vpxor (dst, dst, dst, vlen_enc);
1684     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1685 
1686     switch (elem_bt) {
1687       case T_BYTE:   /* nothing to do */            break;
1688       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1689       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1690       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1691       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1692       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1693 
1694       default: assert(false, "%s", type2name(elem_bt));
1695     }
1696   }
1697 }
1698 
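// Convert a vector of byte booleans in src into an opmask register. On targets without the
// needed AVX512BW/VL forms (novlbwdq), the bytes are sign-extended to dwords and compared for
// equality against vector_int_mask_cmp_bits; otherwise the bytes are negated so their sign
// bits reflect the mask and evpmovb2m extracts those sign bits into dst.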
1699 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1700   if (novlbwdq) {
1701     vpmovsxbd(xtmp, src, vlen_enc);
1702     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1703             Assembler::eq, true, vlen_enc, noreg);
1704   } else {
1705     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1706     vpsubb(xtmp, xtmp, src, vlen_enc);
1707     evpmovb2m(dst, xtmp, vlen_enc);
1708   }
1709 }
1710 
1711 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1712   if (is_integral_type(bt)) {
1713     switch (vlen_in_bytes) {
1714       case 4:  movdl(dst, src);   break;
1715       case 8:  movq(dst, src);    break;
1716       case 16: movdqu(dst, src);  break;
1717       case 32: vmovdqu(dst, src); break;
1718       case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1719       default: ShouldNotReachHere();
1720     }
1721   } else {
1722     switch (vlen_in_bytes) {
1723       case 4:  movflt(dst, src); break;
1724       case 8:  movdbl(dst, src); break;
1725       case 16: movups(dst, src); break;
1726       case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1727       case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1728       default: ShouldNotReachHere();
1729     }
1730   }
1731 }
1732 
1733 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1734   assert(rscratch != noreg || always_reachable(src), "missing");
1735 
1736   if (reachable(src)) {
1737     load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1738   } else {
1739     lea(rscratch, src);
1740     load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1741   }
1742 }
1743 
1744 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1745   int vlen_enc = vector_length_encoding(vlen);
1746   if (VM_Version::supports_avx()) {
1747     if (bt == T_LONG) {
1748       if (VM_Version::supports_avx2()) {
1749         vpbroadcastq(dst, src, vlen_enc);
1750       } else {
1751         vmovddup(dst, src, vlen_enc);
1752       }
1753     } else if (bt == T_DOUBLE) {
1754       if (vlen_enc != Assembler::AVX_128bit) {
1755         vbroadcastsd(dst, src, vlen_enc, noreg);
1756       } else {
1757         vmovddup(dst, src, vlen_enc);
1758       }
1759     } else {
1760       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1761         vpbroadcastd(dst, src, vlen_enc);
1762       } else {
1763         vbroadcastss(dst, src, vlen_enc);
1764       }
1765     }
1766   } else if (VM_Version::supports_sse3()) {
1767     movddup(dst, src);
1768   } else {
1769     load_vector(bt, dst, src, vlen);
1770   }
1771 }
1772 
1773 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1774   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between consecutive types is 64 bytes.
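  // For example, T_INT (4-byte elements) starts at offset 2 << 6 = 128, while T_FLOAT
  // lands at 128 + 128 = 256 because the floating point tables follow the integral ones.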
1775   int offset = exact_log2(type2aelembytes(bt)) << 6;
1776   if (is_floating_point_type(bt)) {
1777     offset += 128;
1778   }
1779   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1780   load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1781 }
1782 
1783 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1784 
1785 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1786   int vector_len = Assembler::AVX_128bit;
1787 
1788   switch (opcode) {
1789     case Op_AndReductionV:  pand(dst, src); break;
1790     case Op_OrReductionV:   por (dst, src); break;
1791     case Op_XorReductionV:  pxor(dst, src); break;
1792     case Op_MinReductionV:
1793       switch (typ) {
1794         case T_BYTE:        pminsb(dst, src); break;
1795         case T_SHORT:       pminsw(dst, src); break;
1796         case T_INT:         pminsd(dst, src); break;
1797         case T_LONG:        assert(UseAVX > 2, "required");
1798                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1799         default:            assert(false, "wrong type");
1800       }
1801       break;
1802     case Op_MaxReductionV:
1803       switch (typ) {
1804         case T_BYTE:        pmaxsb(dst, src); break;
1805         case T_SHORT:       pmaxsw(dst, src); break;
1806         case T_INT:         pmaxsd(dst, src); break;
1807         case T_LONG:        assert(UseAVX > 2, "required");
1808                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1809         default:            assert(false, "wrong type");
1810       }
1811       break;
1812     case Op_AddReductionVF: addss(dst, src); break;
1813     case Op_AddReductionVD: addsd(dst, src); break;
1814     case Op_AddReductionVI:
1815       switch (typ) {
1816         case T_BYTE:        paddb(dst, src); break;
1817         case T_SHORT:       paddw(dst, src); break;
1818         case T_INT:         paddd(dst, src); break;
1819         default:            assert(false, "wrong type");
1820       }
1821       break;
1822     case Op_AddReductionVL: paddq(dst, src); break;
1823     case Op_MulReductionVF: mulss(dst, src); break;
1824     case Op_MulReductionVD: mulsd(dst, src); break;
1825     case Op_MulReductionVI:
1826       switch (typ) {
1827         case T_SHORT:       pmullw(dst, src); break;
1828         case T_INT:         pmulld(dst, src); break;
1829         default:            assert(false, "wrong type");
1830       }
1831       break;
1832     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1833                             evpmullq(dst, dst, src, vector_len); break;
1834     default:                assert(false, "wrong opcode");
1835   }
1836 }
1837 
1838 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1839   switch (opcode) {
1840     case Op_AddReductionVF: addps(dst, src); break;
1841     case Op_AddReductionVD: addpd(dst, src); break;
1842     case Op_MulReductionVF: mulps(dst, src); break;
1843     case Op_MulReductionVD: mulpd(dst, src); break;
1844     default:                assert(false, "%s", NodeClassNames[opcode]);
1845   }
1846 }
1847 
1848 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1849   int vector_len = Assembler::AVX_256bit;
1850 
1851   switch (opcode) {
1852     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1853     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1854     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1855     case Op_MinReductionV:
1856       switch (typ) {
1857         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1858         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1859         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1860         case T_LONG:        assert(UseAVX > 2, "required");
1861                             vpminsq(dst, src1, src2, vector_len); break;
1862         default:            assert(false, "wrong type");
1863       }
1864       break;
1865     case Op_MaxReductionV:
1866       switch (typ) {
1867         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1868         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1869         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1870         case T_LONG:        assert(UseAVX > 2, "required");
1871                             vpmaxsq(dst, src1, src2, vector_len); break;
1872         default:            assert(false, "wrong type");
1873       }
1874       break;
1875     case Op_AddReductionVI:
1876       switch (typ) {
1877         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1878         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1879         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1880         default:            assert(false, "wrong type");
1881       }
1882       break;
1883     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1884     case Op_MulReductionVI:
1885       switch (typ) {
1886         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1887         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1888         default:            assert(false, "wrong type");
1889       }
1890       break;
1891     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1892     default:                assert(false, "wrong opcode");
1893   }
1894 }
1895 
1896 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1897   int vector_len = Assembler::AVX_256bit;
1898 
1899   switch (opcode) {
1900     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1901     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1902     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1903     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1904     default:                assert(false, "%s", NodeClassNames[opcode]);
1905   }
1906 }
1907 
1908 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1909                                   XMMRegister dst, XMMRegister src,
1910                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1911   switch (opcode) {
1912     case Op_AddReductionVF:
1913     case Op_MulReductionVF:
1914       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1915       break;
1916 
1917     case Op_AddReductionVD:
1918     case Op_MulReductionVD:
1919       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1920       break;
1921 
1922     default: assert(false, "wrong opcode");
1923   }
1924 }
1925 
1926 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1927                                             XMMRegister dst, XMMRegister src,
1928                                             XMMRegister vtmp1, XMMRegister vtmp2) {
1929   switch (opcode) {
1930     case Op_AddReductionVF:
1931     case Op_MulReductionVF:
1932       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1933       break;
1934 
1935     case Op_AddReductionVD:
1936     case Op_MulReductionVD:
1937       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1938       break;
1939 
1940     default: assert(false, "%s", NodeClassNames[opcode]);
1941   }
1942 }
1943 
1944 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1945                              Register dst, Register src1, XMMRegister src2,
1946                              XMMRegister vtmp1, XMMRegister vtmp2) {
1947   switch (vlen) {
1948     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1949     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1950     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1951     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1952 
1953     default: assert(false, "wrong vector length");
1954   }
1955 }
1956 
1957 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1958                              Register dst, Register src1, XMMRegister src2,
1959                              XMMRegister vtmp1, XMMRegister vtmp2) {
1960   switch (vlen) {
1961     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1962     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1963     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1964     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1965 
1966     default: assert(false, "wrong vector length");
1967   }
1968 }
1969 
1970 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1971                              Register dst, Register src1, XMMRegister src2,
1972                              XMMRegister vtmp1, XMMRegister vtmp2) {
1973   switch (vlen) {
1974     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1975     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1976     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1977     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1978 
1979     default: assert(false, "wrong vector length");
1980   }
1981 }
1982 
1983 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1984                              Register dst, Register src1, XMMRegister src2,
1985                              XMMRegister vtmp1, XMMRegister vtmp2) {
1986   switch (vlen) {
1987     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1988     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1989     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1990     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1991 
1992     default: assert(false, "wrong vector length");
1993   }
1994 }
1995 
1996 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1997                              Register dst, Register src1, XMMRegister src2,
1998                              XMMRegister vtmp1, XMMRegister vtmp2) {
1999   switch (vlen) {
2000     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2001     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2002     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2003 
2004     default: assert(false, "wrong vector length");
2005   }
2006 }
2007 
2008 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2009   switch (vlen) {
2010     case 2:
2011       assert(vtmp2 == xnoreg, "");
2012       reduce2F(opcode, dst, src, vtmp1);
2013       break;
2014     case 4:
2015       assert(vtmp2 == xnoreg, "");
2016       reduce4F(opcode, dst, src, vtmp1);
2017       break;
2018     case 8:
2019       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2020       break;
2021     case 16:
2022       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2023       break;
2024     default: assert(false, "wrong vector length");
2025   }
2026 }
2027 
2028 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2029   switch (vlen) {
2030     case 2:
2031       assert(vtmp2 == xnoreg, "");
2032       reduce2D(opcode, dst, src, vtmp1);
2033       break;
2034     case 4:
2035       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2036       break;
2037     case 8:
2038       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2039       break;
2040     default: assert(false, "wrong vector length");
2041   }
2042 }
2043 
2044 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2045   switch (vlen) {
2046     case 2:
2047       assert(vtmp1 == xnoreg, "");
2048       assert(vtmp2 == xnoreg, "");
2049       unorderedReduce2F(opcode, dst, src);
2050       break;
2051     case 4:
2052       assert(vtmp2 == xnoreg, "");
2053       unorderedReduce4F(opcode, dst, src, vtmp1);
2054       break;
2055     case 8:
2056       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2057       break;
2058     case 16:
2059       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2060       break;
2061     default: assert(false, "wrong vector length");
2062   }
2063 }
2064 
2065 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2066   switch (vlen) {
2067     case 2:
2068       assert(vtmp1 == xnoreg, "");
2069       assert(vtmp2 == xnoreg, "");
2070       unorderedReduce2D(opcode, dst, src);
2071       break;
2072     case 4:
2073       assert(vtmp2 == xnoreg, "");
2074       unorderedReduce4D(opcode, dst, src, vtmp1);
2075       break;
2076     case 8:
2077       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2078       break;
2079     default: assert(false, "wrong vector length");
2080   }
2081 }
2082 
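// Reduce a two-lane int vector: fold the two lanes into one (phaddd for additions,
// shuffle + op otherwise), combine the result with the scalar input src1, and move
// the final value into dst.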
2083 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2084   if (opcode == Op_AddReductionVI) {
2085     if (vtmp1 != src2) {
2086       movdqu(vtmp1, src2);
2087     }
2088     phaddd(vtmp1, vtmp1);
2089   } else {
2090     pshufd(vtmp1, src2, 0x1);
2091     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2092   }
2093   movdl(vtmp2, src1);
2094   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2095   movdl(dst, vtmp1);
2096 }
2097 
2098 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2099   if (opcode == Op_AddReductionVI) {
2100     if (vtmp1 != src2) {
2101       movdqu(vtmp1, src2);
2102     }
2103     phaddd(vtmp1, src2);
2104     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2105   } else {
2106     pshufd(vtmp2, src2, 0xE);
2107     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2108     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2109   }
2110 }
2111 
2112 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2113   if (opcode == Op_AddReductionVI) {
2114     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2115     vextracti128_high(vtmp2, vtmp1);
2116     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2117     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2118   } else {
2119     vextracti128_high(vtmp1, src2);
2120     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2121     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2122   }
2123 }
2124 
2125 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2126   vextracti64x4_high(vtmp2, src2);
2127   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2128   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2129 }
2130 
2131 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2132   pshufd(vtmp2, src2, 0x1);
2133   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2134   movdqu(vtmp1, vtmp2);
2135   psrldq(vtmp1, 2);
2136   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2137   movdqu(vtmp2, vtmp1);
2138   psrldq(vtmp2, 1);
2139   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2140   movdl(vtmp2, src1);
2141   pmovsxbd(vtmp1, vtmp1);
2142   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2143   pextrb(dst, vtmp1, 0x0);
2144   movsbl(dst, dst);
2145 }
2146 
2147 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2148   pshufd(vtmp1, src2, 0xE);
2149   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2150   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2151 }
2152 
2153 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2154   vextracti128_high(vtmp2, src2);
2155   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2156   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2157 }
2158 
2159 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2160   vextracti64x4_high(vtmp1, src2);
2161   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2162   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2163 }
2164 
2165 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2166   pmovsxbw(vtmp2, src2);
2167   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2168 }
2169 
2170 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2171   if (UseAVX > 1) {
2172     int vector_len = Assembler::AVX_256bit;
2173     vpmovsxbw(vtmp1, src2, vector_len);
2174     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2175   } else {
2176     pmovsxbw(vtmp2, src2);
2177     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2178     pshufd(vtmp2, src2, 0xE); // bring the upper eight bytes to the low half
2179     pmovsxbw(vtmp2, vtmp2);
2180     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2181   }
2182 }
2183 
2184 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2185   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2186     int vector_len = Assembler::AVX_512bit;
2187     vpmovsxbw(vtmp1, src2, vector_len);
2188     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2189   } else {
2190     assert(UseAVX >= 2,"Should not reach here.");
2191     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2192     vextracti128_high(vtmp2, src2);
2193     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2194   }
2195 }
2196 
2197 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2198   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2199   vextracti64x4_high(vtmp2, src2);
2200   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2201 }
2202 
2203 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2204   if (opcode == Op_AddReductionVI) {
2205     if (vtmp1 != src2) {
2206       movdqu(vtmp1, src2);
2207     }
2208     phaddw(vtmp1, vtmp1);
2209     phaddw(vtmp1, vtmp1);
2210   } else {
2211     pshufd(vtmp2, src2, 0x1);
2212     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2213     movdqu(vtmp1, vtmp2);
2214     psrldq(vtmp1, 2);
2215     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2216   }
2217   movdl(vtmp2, src1);
2218   pmovsxwd(vtmp1, vtmp1);
2219   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2220   pextrw(dst, vtmp1, 0x0);
2221   movswl(dst, dst);
2222 }
2223 
2224 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2225   if (opcode == Op_AddReductionVI) {
2226     if (vtmp1 != src2) {
2227       movdqu(vtmp1, src2);
2228     }
2229     phaddw(vtmp1, src2);
2230   } else {
2231     pshufd(vtmp1, src2, 0xE);
2232     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2233   }
2234   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2235 }
2236 
2237 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2238   if (opcode == Op_AddReductionVI) {
2239     int vector_len = Assembler::AVX_256bit;
2240     vphaddw(vtmp2, src2, src2, vector_len);
2241     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2242   } else {
2243     vextracti128_high(vtmp2, src2);
2244     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2245   }
2246   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2247 }
2248 
2249 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2250   int vector_len = Assembler::AVX_256bit;
2251   vextracti64x4_high(vtmp1, src2);
2252   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2253   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2254 }
2255 
2256 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2257   pshufd(vtmp2, src2, 0xE);
2258   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2259   movdq(vtmp1, src1);
2260   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2261   movdq(dst, vtmp1);
2262 }
2263 
2264 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2265   vextracti128_high(vtmp1, src2);
2266   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2267   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2268 }
2269 
2270 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2271   vextracti64x4_high(vtmp2, src2);
2272   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2273   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2274 }
2275 
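// Build an opmask with the low 'len' bits set: start from all ones and clear the
// bits at and above index 'len' with bzhi.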
2276 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2277   mov64(temp, -1L);
2278   bzhiq(temp, temp, len);
2279   kmovql(dst, temp);
2280 }
2281 
2282 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2283   reduce_operation_128(T_FLOAT, opcode, dst, src);
2284   pshufd(vtmp, src, 0x1);
2285   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2286 }
2287 
2288 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2289   reduce2F(opcode, dst, src, vtmp);
2290   pshufd(vtmp, src, 0x2);
2291   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2292   pshufd(vtmp, src, 0x3);
2293   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2294 }
2295 
2296 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2297   reduce4F(opcode, dst, src, vtmp2);
2298   vextractf128_high(vtmp2, src);
2299   reduce4F(opcode, dst, vtmp2, vtmp1);
2300 }
2301 
2302 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2303   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2304   vextracti64x4_high(vtmp1, src);
2305   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2306 }
2307 
2308 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2309   pshufd(dst, src, 0x1);
2310   reduce_operation_128(T_FLOAT, opcode, dst, src);
2311 }
2312 
2313 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2314   pshufd(vtmp, src, 0xE);
2315   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2316   unorderedReduce2F(opcode, dst, vtmp);
2317 }
2318 
2319 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2320   vextractf128_high(vtmp1, src);
2321   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2322   unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2323 }
2324 
2325 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2326   vextractf64x4_high(vtmp2, src);
2327   unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2328   unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2329 }
2330 
2331 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2332   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2333   pshufd(vtmp, src, 0xE);
2334   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2335 }
2336 
2337 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2338   reduce2D(opcode, dst, src, vtmp2);
2339   vextractf128_high(vtmp2, src);
2340   reduce2D(opcode, dst, vtmp2, vtmp1);
2341 }
2342 
2343 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2344   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2345   vextracti64x4_high(vtmp1, src);
2346   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2347 }
2348 
2349 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2350   pshufd(dst, src, 0xE);
2351   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2352 }
2353 
2354 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2355   vextractf128_high(vtmp, src);
2356   unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2357   unorderedReduce2D(opcode, dst, vtmp);
2358 }
2359 
2360 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2361   vextractf64x4_high(vtmp2, src);
2362   unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2363   unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2364 }
2365 
2366 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2367   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2368 }
2369 
2370 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2371   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2372 }
2373 
2374 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
2375   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2376 }
2377 
2378 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2379                                  int vec_enc) {
2380   switch(elem_bt) {
2381     case T_INT:
2382     case T_FLOAT:
2383       vmaskmovps(dst, src, mask, vec_enc);
2384       break;
2385     case T_LONG:
2386     case T_DOUBLE:
2387       vmaskmovpd(dst, src, mask, vec_enc);
2388       break;
2389     default:
2390       fatal("Unsupported type %s", type2name(elem_bt));
2391       break;
2392   }
2393 }
2394 
2395 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2396                                  int vec_enc) {
2397   switch(elem_bt) {
2398     case T_INT:
2399     case T_FLOAT:
2400       vmaskmovps(dst, src, mask, vec_enc);
2401       break;
2402     case T_LONG:
2403     case T_DOUBLE:
2404       vmaskmovpd(dst, src, mask, vec_enc);
2405       break;
2406     default:
2407       fatal("Unsupported type %s", type2name(elem_bt));
2408       break;
2409   }
2410 }
2411 
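// Float min/max reduction: repeatedly halve the live width of the vector, extracting the
// upper 256/128-bit half (or permuting within a 128-bit lane) and combining it with the
// lower half via vminmax_fp, until a single lane remains. If dst already holds a valid
// partial result (is_dst_valid), it is folded in with one final vminmax_fp step.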
2412 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2413                                           XMMRegister dst, XMMRegister src,
2414                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2415                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2416   const int permconst[] = {1, 14};
2417   XMMRegister wsrc = src;
2418   XMMRegister wdst = xmm_0;
2419   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2420 
2421   int vlen_enc = Assembler::AVX_128bit;
2422   if (vlen == 16) {
2423     vlen_enc = Assembler::AVX_256bit;
2424   }
2425 
2426   for (int i = log2(vlen) - 1; i >=0; i--) {
2427     if (i == 0 && !is_dst_valid) {
2428       wdst = dst;
2429     }
2430     if (i == 3) {
2431       vextracti64x4_high(wtmp, wsrc);
2432     } else if (i == 2) {
2433       vextracti128_high(wtmp, wsrc);
2434     } else { // i = [0,1]
2435       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2436     }
2437     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2438     wsrc = wdst;
2439     vlen_enc = Assembler::AVX_128bit;
2440   }
2441   if (is_dst_valid) {
2442     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2443   }
2444 }
2445 
2446 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2447                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2448                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2449   XMMRegister wsrc = src;
2450   XMMRegister wdst = xmm_0;
2451   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2452   int vlen_enc = Assembler::AVX_128bit;
2453   if (vlen == 8) {
2454     vlen_enc = Assembler::AVX_256bit;
2455   }
2456   for (int i = log2(vlen) - 1; i >=0; i--) {
2457     if (i == 0 && !is_dst_valid) {
2458       wdst = dst;
2459     }
2460     if (i == 1) {
2461       vextracti128_high(wtmp, wsrc);
2462     } else if (i == 2) {
2463       vextracti64x4_high(wtmp, wsrc);
2464     } else {
2465       assert(i == 0, "%d", i);
2466       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2467     }
2468     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2469     wsrc = wdst;
2470     vlen_enc = Assembler::AVX_128bit;
2471   }
2472   if (is_dst_valid) {
2473     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2474   }
2475 }
2476 
2477 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2478   switch (bt) {
2479     case T_BYTE:  pextrb(dst, src, idx); break;
2480     case T_SHORT: pextrw(dst, src, idx); break;
2481     case T_INT:   pextrd(dst, src, idx); break;
2482     case T_LONG:  pextrq(dst, src, idx); break;
2483 
2484     default:
2485       assert(false,"Should not reach here.");
2486       break;
2487   }
2488 }
2489 
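// Return the XMM register holding the 128-bit lane that contains 'elemindex': lane 0 is
// returned as src itself, while higher lanes are first extracted into dst.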
2490 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2491   int esize =  type2aelembytes(typ);
2492   int elem_per_lane = 16/esize;
2493   int lane = elemindex / elem_per_lane;
2494   int eindex = elemindex % elem_per_lane;
2495 
2496   if (lane >= 2) {
2497     assert(UseAVX > 2, "required");
2498     vextractf32x4(dst, src, lane & 3);
2499     return dst;
2500   } else if (lane > 0) {
2501     assert(UseAVX > 0, "required");
2502     vextractf128(dst, src, lane);
2503     return dst;
2504   } else {
2505     return src;
2506   }
2507 }
2508 
2509 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2510   if (typ == T_BYTE) {
2511     movsbl(dst, dst);
2512   } else if (typ == T_SHORT) {
2513     movswl(dst, dst);
2514   }
2515 }
2516 
2517 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2518   int esize =  type2aelembytes(typ);
2519   int elem_per_lane = 16/esize;
2520   int eindex = elemindex % elem_per_lane;
2521   assert(is_integral_type(typ),"required");
2522 
2523   if (eindex == 0) {
2524     if (typ == T_LONG) {
2525       movq(dst, src);
2526     } else {
2527       movdl(dst, src);
2528       movsxl(typ, dst);
2529     }
2530   } else {
2531     extract(typ, dst, src, eindex);
2532     movsxl(typ, dst);
2533   }
2534 }
2535 
2536 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2537   int esize =  type2aelembytes(typ);
2538   int elem_per_lane = 16/esize;
2539   int eindex = elemindex % elem_per_lane;
2540   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2541 
2542   if (eindex == 0) {
2543     movq(dst, src);
2544   } else {
2545     if (typ == T_FLOAT) {
2546       if (UseAVX == 0) {
2547         movdqu(dst, src);
2548         shufps(dst, dst, eindex);
2549       } else {
2550         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2551       }
2552     } else {
2553       if (UseAVX == 0) {
2554         movdqu(dst, src);
2555         psrldq(dst, eindex*esize);
2556       } else {
2557         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2558       }
2559       movq(dst, dst);
2560     }
2561   }
2562   // Zero upper bits
2563   if (typ == T_FLOAT) {
2564     if (UseAVX == 0) {
2565       assert(vtmp != xnoreg, "required.");
2566       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2567       pand(dst, vtmp);
2568     } else {
2569       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2570     }
2571   }
2572 }
2573 
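// AVX-512 masked compare dispatched on element type. Note that T_FLOAT and T_DOUBLE lanes
// are compared with the integer compare of matching width, i.e. on their raw bit patterns.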
2574 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2575   switch(typ) {
2576     case T_BYTE:
2577     case T_BOOLEAN:
2578       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2579       break;
2580     case T_SHORT:
2581     case T_CHAR:
2582       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2583       break;
2584     case T_INT:
2585     case T_FLOAT:
2586       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2587       break;
2588     case T_LONG:
2589     case T_DOUBLE:
2590       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2591       break;
2592     default:
2593       assert(false,"Should not reach here.");
2594       break;
2595   }
2596 }
2597 
2598 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2599   assert(rscratch != noreg || always_reachable(src2), "missing");
2600 
2601   switch(typ) {
2602     case T_BOOLEAN:
2603     case T_BYTE:
2604       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2605       break;
2606     case T_CHAR:
2607     case T_SHORT:
2608       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2609       break;
2610     case T_INT:
2611     case T_FLOAT:
2612       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2613       break;
2614     case T_LONG:
2615     case T_DOUBLE:
2616       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2617       break;
2618     default:
2619       assert(false,"Should not reach here.");
2620       break;
2621   }
2622 }
2623 
2624 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2625   switch(typ) {
2626     case T_BYTE:
2627       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2628       break;
2629     case T_SHORT:
2630       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2631       break;
2632     case T_INT:
2633     case T_FLOAT:
2634       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2635       break;
2636     case T_LONG:
2637     case T_DOUBLE:
2638       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2639       break;
2640     default:
2641       assert(false,"Should not reach here.");
2642       break;
2643   }
2644 }
2645 
2646 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2647   assert(vlen_in_bytes <= 32, "");
2648   int esize = type2aelembytes(bt);
2649   if (vlen_in_bytes == 32) {
2650     assert(vtmp == xnoreg, "required.");
2651     if (esize >= 4) {
2652       vtestps(src1, src2, AVX_256bit);
2653     } else {
2654       vptest(src1, src2, AVX_256bit);
2655     }
2656     return;
2657   }
2658   if (vlen_in_bytes < 16) {
2659     // Duplicate the lower part to fill the whole register;
2660     // there is no need to do so for src2.
2661     assert(vtmp != xnoreg, "required");
2662     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2663     pshufd(vtmp, src1, shuffle_imm);
2664   } else {
2665     assert(vtmp == xnoreg, "required");
2666     vtmp = src1;
2667   }
2668   if (esize >= 4 && VM_Version::supports_avx()) {
2669     vtestps(vtmp, src2, AVX_128bit);
2670   } else {
2671     ptest(vtmp, src2);
2672   }
2673 }
2674 
2675 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2676 #ifdef ASSERT
2677   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2678   bool is_bw_supported = VM_Version::supports_avx512bw();
2679   if (is_bw && !is_bw_supported) {
2680     assert(vlen_enc != Assembler::AVX_512bit, "required");
2681     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2682            "XMM register should be 0-15");
2683   }
2684 #endif // ASSERT
2685   switch (elem_bt) {
2686     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2687     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2688     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2689     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2690     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2691     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2692     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2693   }
2694 }
2695 
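// Broadcast a general purpose register value to every lane of dst. When the EVEX GPR-source
// broadcasts are usable (AVX-512, plus BW/VL where the element type and vector length require
// them), they are emitted directly; otherwise the value is first moved into dst via
// movdl/movdq and then broadcast with the AVX2 register-source forms.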
2696 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2697   assert(UseAVX >= 2, "required");
2698   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2699   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2700   if ((UseAVX > 2) &&
2701       (!is_bw || VM_Version::supports_avx512bw()) &&
2702       (!is_vl || VM_Version::supports_avx512vl())) {
2703     switch (elem_bt) {
2704       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2705       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2706       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2707       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2708       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2709     }
2710   } else {
2711     assert(vlen_enc != Assembler::AVX_512bit, "required");
2712     assert((dst->encoding() < 16),"XMM register should be 0-15");
2713     switch (elem_bt) {
2714       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2715       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2716       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2717       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2718       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2719       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2720       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2721     }
2722   }
2723 }
2724 
2725 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
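       // Sign-extends byte elements of src to the requested element type. For T_FLOAT
       // and T_DOUBLE the bytes are first widened to ints and then converted with
       // vcvtdq2ps/vcvtdq2pd.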
2726   switch (to_elem_bt) {
2727     case T_SHORT:
2728       vpmovsxbw(dst, src, vlen_enc);
2729       break;
2730     case T_INT:
2731       vpmovsxbd(dst, src, vlen_enc);
2732       break;
2733     case T_FLOAT:
2734       vpmovsxbd(dst, src, vlen_enc);
2735       vcvtdq2ps(dst, dst, vlen_enc);
2736       break;
2737     case T_LONG:
2738       vpmovsxbq(dst, src, vlen_enc);
2739       break;
2740     case T_DOUBLE: {
2741       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2742       vpmovsxbd(dst, src, mid_vlen_enc);
2743       vcvtdq2pd(dst, dst, vlen_enc);
2744       break;
2745     }
2746     default:
2747       fatal("Unsupported type %s", type2name(to_elem_bt));
2748       break;
2749   }
2750 }
2751 
2752 //-------------------------------------------------------------------------------------------
2753 
2754 // IndexOf for constant substrings with size >= 8 chars
2755 // which don't need to be loaded through the stack.
2756 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2757                                          Register cnt1, Register cnt2,
2758                                          int int_cnt2,  Register result,
2759                                          XMMRegister vec, Register tmp,
2760                                          int ae) {
2761   ShortBranchVerifier sbv(this);
2762   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2763   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2764 
2765   // This method uses the pcmpestri instruction with bound registers
2766   //   inputs:
2767   //     xmm - substring
2768   //     rax - substring length (elements count)
2769   //     mem - scanned string
2770   //     rdx - string length (elements count)
2771   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2772   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2773   //   outputs:
2774   //     rcx - matched index in string
2775   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2776   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2777   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2778   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2779   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
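       // pcmpestri runs here in "equal ordered" (substring search) mode: CF is set when
       // the 16-byte chunk contains a candidate match (rcx = its offset within the chunk),
       // and OF is set when the match starts at offset 0, i.e. the chunk begins with the
       // loaded part of the substring.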
2780 
2781   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2782         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2783         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2784 
2785   // Note, inline_string_indexOf() generates checks:
2786   // if (substr.count > string.count) return -1;
2787   // if (substr.count == 0) return 0;
2788   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2789 
2790   // Load substring.
2791   if (ae == StrIntrinsicNode::UL) {
2792     pmovzxbw(vec, Address(str2, 0));
2793   } else {
2794     movdqu(vec, Address(str2, 0));
2795   }
2796   movl(cnt2, int_cnt2);
2797   movptr(result, str1); // string addr
2798 
2799   if (int_cnt2 > stride) {
2800     jmpb(SCAN_TO_SUBSTR);
2801 
2802     // Reload substr for rescan; this code
2803     // is executed only for large substrings (> 8 chars).
2804     bind(RELOAD_SUBSTR);
2805     if (ae == StrIntrinsicNode::UL) {
2806       pmovzxbw(vec, Address(str2, 0));
2807     } else {
2808       movdqu(vec, Address(str2, 0));
2809     }
2810     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2811 
2812     bind(RELOAD_STR);
2813     // We came here after the beginning of the substring was
2814     // matched but the rest of it was not, so we need to search
2815     // again. Start from the next element after the previous match.
2816 
2817     // cnt2 is the number of remaining substring elements and
2818     // cnt1 is the number of remaining string elements when the compare failed.
2819     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2820     subl(cnt1, cnt2);
2821     addl(cnt1, int_cnt2);
2822     movl(cnt2, int_cnt2); // Now restore cnt2
2823 
2824     decrementl(cnt1);     // Shift to next element
2825     cmpl(cnt1, cnt2);
2826     jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
2827 
2828     addptr(result, (1<<scale1));
2829 
2830   } // (int_cnt2 > 8)
2831 
2832   // Scan string for start of substr in 16-byte vectors
2833   bind(SCAN_TO_SUBSTR);
2834   pcmpestri(vec, Address(result, 0), mode);
2835   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2836   subl(cnt1, stride);
2837   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2838   cmpl(cnt1, cnt2);
2839   jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
2840   addptr(result, 16);
2841   jmpb(SCAN_TO_SUBSTR);
2842 
2843   // Found a potential substr
2844   bind(FOUND_CANDIDATE);
2845   // Matched whole vector if first element matched (tmp(rcx) == 0).
2846   if (int_cnt2 == stride) {
2847     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2848   } else { // int_cnt2 > 8
2849     jccb(Assembler::overflow, FOUND_SUBSTR);
2850   }
2851   // After pcmpestri tmp(rcx) contains matched element index
2852   // Compute start addr of substr
2853   lea(result, Address(result, tmp, scale1));
2854 
2855   // Make sure string is still long enough
2856   subl(cnt1, tmp);
2857   cmpl(cnt1, cnt2);
2858   if (int_cnt2 == stride) {
2859     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2860   } else { // int_cnt2 > 8
2861     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2862   }
2863   // Fewer elements left than the substring.
2864 
2865   bind(RET_NOT_FOUND);
2866   movl(result, -1);
2867   jmp(EXIT);
2868 
2869   if (int_cnt2 > stride) {
2870     // This code is optimized for the case when whole substring
2871     // is matched if its head is matched.
2872     bind(MATCH_SUBSTR_HEAD);
2873     pcmpestri(vec, Address(result, 0), mode);
2874     // Reload only the string if it does not match
2875     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2876 
2877     Label CONT_SCAN_SUBSTR;
2878     // Compare the rest of substring (> 8 chars).
2879     bind(FOUND_SUBSTR);
2880     // First 8 chars are already matched.
2881     negptr(cnt2);
2882     addptr(cnt2, stride);
2883 
2884     bind(SCAN_SUBSTR);
2885     subl(cnt1, stride);
2886     cmpl(cnt2, -stride); // Do not read beyond substring
2887     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2888     // Back-up strings to avoid reading beyond substring:
2889     // cnt1 = cnt1 - cnt2 + 8
2890     addl(cnt1, cnt2); // cnt2 is negative
2891     addl(cnt1, stride);
2892     movl(cnt2, stride); negptr(cnt2);
2893     bind(CONT_SCAN_SUBSTR);
2894     if (int_cnt2 < (int)G) {
2895       int tail_off1 = int_cnt2<<scale1;
2896       int tail_off2 = int_cnt2<<scale2;
2897       if (ae == StrIntrinsicNode::UL) {
2898         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2899       } else {
2900         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2901       }
2902       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2903     } else {
2904       // calculate index in register to avoid integer overflow (int_cnt2*2)
2905       movl(tmp, int_cnt2);
2906       addptr(tmp, cnt2);
2907       if (ae == StrIntrinsicNode::UL) {
2908         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2909       } else {
2910         movdqu(vec, Address(str2, tmp, scale2, 0));
2911       }
2912       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2913     }
2914     // Need to reload the string pointers if the whole vector did not match
2915     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2916     addptr(cnt2, stride);
2917     jcc(Assembler::negative, SCAN_SUBSTR);
2918     // Fall through if found full substring
2919 
2920   } // (int_cnt2 > 8)
2921 
2922   bind(RET_FOUND);
2923   // Found result if we matched full small substring.
2924   // Compute substr offset
2925   subptr(result, str1);
2926   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2927     shrl(result, 1); // index
2928   }
2929   bind(EXIT);
2930 
2931 } // string_indexofC8
2932 
2933 // Small strings are loaded through the stack if they cross a page boundary.
2934 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2935                                        Register cnt1, Register cnt2,
2936                                        int int_cnt2,  Register result,
2937                                        XMMRegister vec, Register tmp,
2938                                        int ae) {
2939   ShortBranchVerifier sbv(this);
2940   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2941   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2942 
2943   //
2944   // int_cnt2 is the length of a small (< 8 chars) constant substring
2945   // or (-1) for a non-constant substring, in which case its length
2946   // is in the cnt2 register.
2947   //
2948   // Note, inline_string_indexOf() generates checks:
2949   // if (substr.count > string.count) return -1;
2950   // if (substr.count == 0) return 0;
2951   //
2952   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2953   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2954   // This method uses the pcmpestri instruction with bound registers
2955   //   inputs:
2956   //     xmm - substring
2957   //     rax - substring length (elements count)
2958   //     mem - scanned string
2959   //     rdx - string length (elements count)
2960   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2961   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2962   //   outputs:
2963   //     rcx - matched index in string
2964   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2965   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2966   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2967   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2968 
2969   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2970         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2971         FOUND_CANDIDATE;
2972 
2973   { //========================================================
2974     // We don't know where these strings are located
2975     // and we can't read beyond them. Load them through stack.
2976     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
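         // Strings shorter than one 16-byte vector that lie too close to the end of a page
         // for a full-width load are copied onto the stack below; a small constant substring
         // is instead assembled with narrower loads and shifts.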
2977 
2978     movptr(tmp, rsp); // save old SP
2979 
2980     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2981       if (int_cnt2 == (1>>scale2)) { // One byte
2982         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2983         load_unsigned_byte(result, Address(str2, 0));
2984         movdl(vec, result); // move 32 bits
2985       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2986         // Not enough header space in 32-bit VM: 12+3 = 15.
2987         movl(result, Address(str2, -1));
2988         shrl(result, 8);
2989         movdl(vec, result); // move 32 bits
2990       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2991         load_unsigned_short(result, Address(str2, 0));
2992         movdl(vec, result); // move 32 bits
2993       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2994         movdl(vec, Address(str2, 0)); // move 32 bits
2995       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2996         movq(vec, Address(str2, 0));  // move 64 bits
2997       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2998         // Array header size is 12 bytes in 32-bit VM
2999         // + 6 bytes for 3 chars == 18 bytes,
3000         // enough space to load vec and shift.
3001         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3002         if (ae == StrIntrinsicNode::UL) {
3003           int tail_off = int_cnt2-8;
3004           pmovzxbw(vec, Address(str2, tail_off));
3005           psrldq(vec, -2*tail_off);
3006         }
3007         else {
3008           int tail_off = int_cnt2*(1<<scale2);
3009           movdqu(vec, Address(str2, tail_off-16));
3010           psrldq(vec, 16-tail_off);
3011         }
3012       }
3013     } else { // not constant substring
3014       cmpl(cnt2, stride);
3015       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3016 
3017       // We can read beyond the string if str2+16 does not cross a page boundary,
3018       // since heaps are aligned and mapped by pages.
3019       assert(os::vm_page_size() < (int)G, "default page should be small");
3020       movl(result, str2); // We need only low 32 bits
3021       andl(result, ((int)os::vm_page_size()-1));
3022       cmpl(result, ((int)os::vm_page_size()-16));
3023       jccb(Assembler::belowEqual, CHECK_STR);
3024 
3025       // Move small strings to the stack to allow loading 16 bytes into vec.
3026       subptr(rsp, 16);
3027       int stk_offset = wordSize-(1<<scale2);
3028       push(cnt2);
3029 
3030       bind(COPY_SUBSTR);
3031       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3032         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3033         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3034       } else if (ae == StrIntrinsicNode::UU) {
3035         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3036         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3037       }
3038       decrement(cnt2);
3039       jccb(Assembler::notZero, COPY_SUBSTR);
3040 
3041       pop(cnt2);
3042       movptr(str2, rsp);  // New substring address
3043     } // non constant
3044 
3045     bind(CHECK_STR);
3046     cmpl(cnt1, stride);
3047     jccb(Assembler::aboveEqual, BIG_STRINGS);
3048 
3049     // Check cross page boundary.
3050     movl(result, str1); // We need only low 32 bits
3051     andl(result, ((int)os::vm_page_size()-1));
3052     cmpl(result, ((int)os::vm_page_size()-16));
3053     jccb(Assembler::belowEqual, BIG_STRINGS);
3054 
3055     subptr(rsp, 16);
3056     int stk_offset = -(1<<scale1);
3057     if (int_cnt2 < 0) { // not constant
3058       push(cnt2);
3059       stk_offset += wordSize;
3060     }
3061     movl(cnt2, cnt1);
3062 
3063     bind(COPY_STR);
3064     if (ae == StrIntrinsicNode::LL) {
3065       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3066       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3067     } else {
3068       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3069       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3070     }
3071     decrement(cnt2);
3072     jccb(Assembler::notZero, COPY_STR);
3073 
3074     if (int_cnt2 < 0) { // not constant
3075       pop(cnt2);
3076     }
3077     movptr(str1, rsp);  // New string address
3078 
3079     bind(BIG_STRINGS);
3080     // Load substring.
3081     if (int_cnt2 < 0) { // -1
3082       if (ae == StrIntrinsicNode::UL) {
3083         pmovzxbw(vec, Address(str2, 0));
3084       } else {
3085         movdqu(vec, Address(str2, 0));
3086       }
3087       push(cnt2);       // substr count
3088       push(str2);       // substr addr
3089       push(str1);       // string addr
3090     } else {
3091       // Small (< 8 chars) constant substrings are loaded already.
3092       movl(cnt2, int_cnt2);
3093     }
3094     push(tmp);  // original SP
3095 
3096   } // Finished loading
3097 
3098   //========================================================
3099   // Start search
3100   //
3101 
3102   movptr(result, str1); // string addr
3103 
3104   if (int_cnt2  < 0) {  // Only for non constant substring
3105     jmpb(SCAN_TO_SUBSTR);
3106 
3107     // SP saved at sp+0
3108     // String saved at sp+1*wordSize
3109     // Substr saved at sp+2*wordSize
3110     // Substr count saved at sp+3*wordSize
3111 
3112     // Reload substr for rescan; this code
3113     // is executed only for large substrings (> 8 chars).
3114     bind(RELOAD_SUBSTR);
3115     movptr(str2, Address(rsp, 2*wordSize));
3116     movl(cnt2, Address(rsp, 3*wordSize));
3117     if (ae == StrIntrinsicNode::UL) {
3118       pmovzxbw(vec, Address(str2, 0));
3119     } else {
3120       movdqu(vec, Address(str2, 0));
3121     }
3122     // We came here after the beginning of the substring was
3123     // matched but the rest of it was not, so we need to search
3124     // again. Start from the next element after the previous match.
3125     subptr(str1, result); // Restore counter
3126     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3127       shrl(str1, 1);
3128     }
3129     addl(cnt1, str1);
3130     decrementl(cnt1);   // Shift to next element
3131     cmpl(cnt1, cnt2);
3132     jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
3133 
3134     addptr(result, (1<<scale1));
3135   } // non constant
3136 
3137   // Scan string for start of substr in 16-byte vectors
3138   bind(SCAN_TO_SUBSTR);
3139   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3140   pcmpestri(vec, Address(result, 0), mode);
3141   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3142   subl(cnt1, stride);
3143   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3144   cmpl(cnt1, cnt2);
3145   jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
3146   addptr(result, 16);
3147 
3148   bind(ADJUST_STR);
3149   cmpl(cnt1, stride); // Do not read beyond string
3150   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3151   // Back-up string to avoid reading beyond string.
3152   lea(result, Address(result, cnt1, scale1, -16));
3153   movl(cnt1, stride);
3154   jmpb(SCAN_TO_SUBSTR);
3155 
3156   // Found a potential substr
3157   bind(FOUND_CANDIDATE);
3158   // After pcmpestri tmp(rcx) contains matched element index
3159 
3160   // Make sure string is still long enough
3161   subl(cnt1, tmp);
3162   cmpl(cnt1, cnt2);
3163   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3164   // Fewer elements left than the substring.
3165 
3166   bind(RET_NOT_FOUND);
3167   movl(result, -1);
3168   jmp(CLEANUP);
3169 
3170   bind(FOUND_SUBSTR);
3171   // Compute start addr of substr
3172   lea(result, Address(result, tmp, scale1));
3173   if (int_cnt2 > 0) { // Constant substring
3174     // Repeat search for small substring (< 8 chars)
3175     // from new point without reloading substring.
3176     // Have to check that we don't read beyond string.
3177     cmpl(tmp, stride-int_cnt2);
3178     jccb(Assembler::greater, ADJUST_STR);
3179     // Fall through if matched whole substring.
3180   } else { // non constant
3181     assert(int_cnt2 == -1, "should be != 0");
3182 
3183     addl(tmp, cnt2);
3184     // Found result if we matched whole substring.
3185     cmpl(tmp, stride);
3186     jcc(Assembler::lessEqual, RET_FOUND);
3187 
3188     // Repeat search for small substring (<= 8 chars)
3189     // from new point 'str1' without reloading substring.
3190     cmpl(cnt2, stride);
3191     // Have to check that we don't read beyond string.
3192     jccb(Assembler::lessEqual, ADJUST_STR);
3193 
3194     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3195     // Compare the rest of substring (> 8 chars).
3196     movptr(str1, result);
3197 
3198     cmpl(tmp, cnt2);
3199     // First 8 chars are already matched.
3200     jccb(Assembler::equal, CHECK_NEXT);
3201 
3202     bind(SCAN_SUBSTR);
3203     pcmpestri(vec, Address(str1, 0), mode);
3204     // Need to reload the string pointers if the whole vector did not match
3205     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3206 
3207     bind(CHECK_NEXT);
3208     subl(cnt2, stride);
3209     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3210     addptr(str1, 16);
3211     if (ae == StrIntrinsicNode::UL) {
3212       addptr(str2, 8);
3213     } else {
3214       addptr(str2, 16);
3215     }
3216     subl(cnt1, stride);
3217     cmpl(cnt2, stride); // Do not read beyond substring
3218     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3219     // Back-up strings to avoid reading beyond substring.
3220 
3221     if (ae == StrIntrinsicNode::UL) {
3222       lea(str2, Address(str2, cnt2, scale2, -8));
3223       lea(str1, Address(str1, cnt2, scale1, -16));
3224     } else {
3225       lea(str2, Address(str2, cnt2, scale2, -16));
3226       lea(str1, Address(str1, cnt2, scale1, -16));
3227     }
3228     subl(cnt1, cnt2);
3229     movl(cnt2, stride);
3230     addl(cnt1, stride);
3231     bind(CONT_SCAN_SUBSTR);
3232     if (ae == StrIntrinsicNode::UL) {
3233       pmovzxbw(vec, Address(str2, 0));
3234     } else {
3235       movdqu(vec, Address(str2, 0));
3236     }
3237     jmp(SCAN_SUBSTR);
3238 
3239     bind(RET_FOUND_LONG);
3240     movptr(str1, Address(rsp, wordSize));
3241   } // non constant
3242 
3243   bind(RET_FOUND);
3244   // Compute substr offset
3245   subptr(result, str1);
3246   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3247     shrl(result, 1); // index
3248   }
3249   bind(CLEANUP);
3250   pop(rsp); // restore SP
3251 
3252 } // string_indexof
3253 
3254 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3255                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3256   ShortBranchVerifier sbv(this);
3257   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3258 
3259   int stride = 8;
3260 
3261   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3262         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3263         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3264         FOUND_SEQ_CHAR, DONE_LABEL;
3265 
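       // Strategy: with AVX2 and at least 16 remaining chars, scan 16 chars (32 bytes)
       // per iteration with vpcmpeqw/vptest, then 8 chars per iteration with the SSE
       // pcmpeqw path, and finish with a scalar loop over the remaining tail.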
3266   movptr(result, str1);
3267   if (UseAVX >= 2) {
3268     cmpl(cnt1, stride);
3269     jcc(Assembler::less, SCAN_TO_CHAR);
3270     cmpl(cnt1, 2*stride);
3271     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3272     movdl(vec1, ch);
3273     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3274     vpxor(vec2, vec2);
3275     movl(tmp, cnt1);
3276     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3277     andl(cnt1,0x0000000F);  //tail count (in chars)
3278 
3279     bind(SCAN_TO_16_CHAR_LOOP);
3280     vmovdqu(vec3, Address(result, 0));
3281     vpcmpeqw(vec3, vec3, vec1, 1);
3282     vptest(vec2, vec3);
3283     jcc(Assembler::carryClear, FOUND_CHAR);
3284     addptr(result, 32);
3285     subl(tmp, 2*stride);
3286     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3287     jmp(SCAN_TO_8_CHAR);
3288     bind(SCAN_TO_8_CHAR_INIT);
3289     movdl(vec1, ch);
3290     pshuflw(vec1, vec1, 0x00);
3291     pshufd(vec1, vec1, 0);
3292     pxor(vec2, vec2);
3293   }
3294   bind(SCAN_TO_8_CHAR);
3295   cmpl(cnt1, stride);
3296   jcc(Assembler::less, SCAN_TO_CHAR);
3297   if (UseAVX < 2) {
3298     movdl(vec1, ch);
3299     pshuflw(vec1, vec1, 0x00);
3300     pshufd(vec1, vec1, 0);
3301     pxor(vec2, vec2);
3302   }
3303   movl(tmp, cnt1);
3304   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3305   andl(cnt1,0x00000007);  //tail count (in chars)
3306 
3307   bind(SCAN_TO_8_CHAR_LOOP);
3308   movdqu(vec3, Address(result, 0));
3309   pcmpeqw(vec3, vec1);
3310   ptest(vec2, vec3);
3311   jcc(Assembler::carryClear, FOUND_CHAR);
3312   addptr(result, 16);
3313   subl(tmp, stride);
3314   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3315   bind(SCAN_TO_CHAR);
3316   testl(cnt1, cnt1);
3317   jcc(Assembler::zero, RET_NOT_FOUND);
3318   bind(SCAN_TO_CHAR_LOOP);
3319   load_unsigned_short(tmp, Address(result, 0));
3320   cmpl(ch, tmp);
3321   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3322   addptr(result, 2);
3323   subl(cnt1, 1);
3324   jccb(Assembler::zero, RET_NOT_FOUND);
3325   jmp(SCAN_TO_CHAR_LOOP);
3326 
3327   bind(RET_NOT_FOUND);
3328   movl(result, -1);
3329   jmpb(DONE_LABEL);
3330 
3331   bind(FOUND_CHAR);
3332   if (UseAVX >= 2) {
3333     vpmovmskb(tmp, vec3);
3334   } else {
3335     pmovmskb(tmp, vec3);
3336   }
3337   bsfl(ch, tmp);
3338   addptr(result, ch);
3339 
3340   bind(FOUND_SEQ_CHAR);
3341   subptr(result, str1);
3342   shrl(result, 1);
3343 
3344   bind(DONE_LABEL);
3345 } // string_indexof_char
3346 
3347 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3348                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3349   ShortBranchVerifier sbv(this);
3350   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3351 
3352   int stride = 16;
3353 
3354   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3355         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3356         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3357         FOUND_SEQ_CHAR, DONE_LABEL;
3358 
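       // Latin-1 variant of the search above: with AVX2 scan 32 bytes per iteration with
       // vpcmpeqb/vptest, then 16 bytes per iteration with pcmpeqb, and finish with a
       // scalar byte loop over the tail.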
3359   movptr(result, str1);
3360   if (UseAVX >= 2) {
3361     cmpl(cnt1, stride);
3362     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3363     cmpl(cnt1, stride*2);
3364     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3365     movdl(vec1, ch);
3366     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3367     vpxor(vec2, vec2);
3368     movl(tmp, cnt1);
3369     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3370     andl(cnt1,0x0000001F);  //tail count (in chars)
3371 
3372     bind(SCAN_TO_32_CHAR_LOOP);
3373     vmovdqu(vec3, Address(result, 0));
3374     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3375     vptest(vec2, vec3);
3376     jcc(Assembler::carryClear, FOUND_CHAR);
3377     addptr(result, 32);
3378     subl(tmp, stride*2);
3379     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3380     jmp(SCAN_TO_16_CHAR);
3381 
3382     bind(SCAN_TO_16_CHAR_INIT);
3383     movdl(vec1, ch);
3384     pxor(vec2, vec2);
3385     pshufb(vec1, vec2);
3386   }
3387 
3388   bind(SCAN_TO_16_CHAR);
3389   cmpl(cnt1, stride);
3390   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3391   if (UseAVX < 2) {
3392     movdl(vec1, ch);
3393     pxor(vec2, vec2);
3394     pshufb(vec1, vec2);
3395   }
3396   movl(tmp, cnt1);
3397   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3398   andl(cnt1,0x0000000F);  //tail count (in bytes)
3399 
3400   bind(SCAN_TO_16_CHAR_LOOP);
3401   movdqu(vec3, Address(result, 0));
3402   pcmpeqb(vec3, vec1);
3403   ptest(vec2, vec3);
3404   jcc(Assembler::carryClear, FOUND_CHAR);
3405   addptr(result, 16);
3406   subl(tmp, stride);
3407   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3408 
3409   bind(SCAN_TO_CHAR_INIT);
3410   testl(cnt1, cnt1);
3411   jcc(Assembler::zero, RET_NOT_FOUND);
3412   bind(SCAN_TO_CHAR_LOOP);
3413   load_unsigned_byte(tmp, Address(result, 0));
3414   cmpl(ch, tmp);
3415   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3416   addptr(result, 1);
3417   subl(cnt1, 1);
3418   jccb(Assembler::zero, RET_NOT_FOUND);
3419   jmp(SCAN_TO_CHAR_LOOP);
3420 
3421   bind(RET_NOT_FOUND);
3422   movl(result, -1);
3423   jmpb(DONE_LABEL);
3424 
3425   bind(FOUND_CHAR);
3426   if (UseAVX >= 2) {
3427     vpmovmskb(tmp, vec3);
3428   } else {
3429     pmovmskb(tmp, vec3);
3430   }
3431   bsfl(ch, tmp);
3432   addptr(result, ch);
3433 
3434   bind(FOUND_SEQ_CHAR);
3435   subptr(result, str1);
3436 
3437   bind(DONE_LABEL);
3438 } // stringL_indexof_char
3439 
3440 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3441   switch (eltype) {
3442   case T_BOOLEAN: return sizeof(jboolean);
3443   case T_BYTE:  return sizeof(jbyte);
3444   case T_SHORT: return sizeof(jshort);
3445   case T_CHAR:  return sizeof(jchar);
3446   case T_INT:   return sizeof(jint);
3447   default:
3448     ShouldNotReachHere();
3449     return -1;
3450   }
3451 }
3452 
3453 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3454   switch (eltype) {
3455   // T_BOOLEAN used as surrogate for unsigned byte
3456   case T_BOOLEAN: movzbl(dst, src);   break;
3457   case T_BYTE:    movsbl(dst, src);   break;
3458   case T_SHORT:   movswl(dst, src);   break;
3459   case T_CHAR:    movzwl(dst, src);   break;
3460   case T_INT:     movl(dst, src);     break;
3461   default:
3462     ShouldNotReachHere();
3463   }
3464 }
3465 
3466 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3467   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3468 }
3469 
3470 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3471   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3472 }
3473 
3474 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
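       // Widens the eight loaded elements in dst to 32-bit ints within a 256-bit vector:
       // zero-extending for T_BOOLEAN and T_CHAR, sign-extending for T_BYTE and T_SHORT;
       // T_INT needs no conversion.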
3475   const int vlen = Assembler::AVX_256bit;
3476   switch (eltype) {
3477   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3478   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3479   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3480   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3481   case T_INT:
3482     // do nothing
3483     break;
3484   default:
3485     ShouldNotReachHere();
3486   }
3487 }
3488 
3489 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3490                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3491                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3492                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3493                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3494                                         BasicType eltype) {
3495   ShortBranchVerifier sbv(this);
3496   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3497   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3498   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3499 
3500   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3501         SHORT_UNROLLED_LOOP_EXIT,
3502         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3503         UNROLLED_VECTOR_LOOP_BEGIN,
3504         END;
3505   switch (eltype) {
3506   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3507   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3508   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3509   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3510   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3511   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3512   }
3513 
3514   // "Renaming" the registers for readability of the code
3515   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3516                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3517                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3518 
3519   const int elsize = arrays_hashcode_elsize(eltype);
3520 
3521   /*
3522     if (cnt1 >= 2) {
3523       if (cnt1 >= 32) {
3524         UNROLLED VECTOR LOOP
3525       }
3526       UNROLLED SCALAR LOOP
3527     }
3528     SINGLE SCALAR
3529    */
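       /*
         The scalar recurrence is h = 31*h + a[i]. For one block of 32 elements this
         unrolls to
             h' = 31^32 * h + sum_{k=0}^{31} 31^(31-k) * a[k]
         which is what the vectorized loop below computes: 'result' is scaled by 31^32
         per block (vnext), the four 8-lane accumulators collect the per-lane partial
         sums, and after the loop each lane is multiplied by its 31^(31-k) coefficient
         (the powers-of-31 table is assumed to hold 31^32 down to 31^0) before being
         reduced into 'result'.
        */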
3530 
3531   cmpl(cnt1, 32);
3532   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3533 
3534   // cnt1 >= 32 && generate_vectorized_loop
3535   xorl(index, index);
3536 
3537   // vresult = IntVector.zero(I256);
3538   for (int idx = 0; idx < 4; idx++) {
3539     vpxor(vresult[idx], vresult[idx]);
3540   }
3541   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3542   Register bound = tmp2;
3543   Register next = tmp3;
3544   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3545   movl(next, Address(tmp2, 0));
3546   movdl(vnext, next);
3547   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3548 
3549   // index = 0;
3550   // bound = cnt1 & ~(32 - 1);
3551   movl(bound, cnt1);
3552   andl(bound, ~(32 - 1));
3553   // for (; index < bound; index += 32) {
3554   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3555   // result *= next;
3556   imull(result, next);
3557   // Loop fission to front-load the cost of fetching from memory; OOO execution
3558   // can then hopefully do a better job of prefetching.
3559   for (int idx = 0; idx < 4; idx++) {
3560     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3561   }
3562   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3563   for (int idx = 0; idx < 4; idx++) {
3564     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3565     arrays_hashcode_elvcast(vtmp[idx], eltype);
3566     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3567   }
3568   // index += 32;
3569   addl(index, 32);
3570   // index < bound;
3571   cmpl(index, bound);
3572   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3573   // }
3574 
3575   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3576   subl(cnt1, bound);
3577   // release bound
3578 
3579   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3580   for (int idx = 0; idx < 4; idx++) {
3581     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3582     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3583     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3584   }
3585   // result += vresult.reduceLanes(ADD);
3586   for (int idx = 0; idx < 4; idx++) {
3587     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3588   }
3589 
3590   // } else if (cnt1 < 32) {
3591 
3592   bind(SHORT_UNROLLED_BEGIN);
3593   // int i = 1;
3594   movl(index, 1);
3595   cmpl(index, cnt1);
3596   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3597 
3598   // for (; i < cnt1 ; i += 2) {
3599   bind(SHORT_UNROLLED_LOOP_BEGIN);
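       // Two elements per iteration: result = 31*31*result + 31*ary1[i-1] + ary1[i],
       // using 961 == 31*31 and computing 31*x as (x << 5) - x.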
3600   movl(tmp3, 961);
3601   imull(result, tmp3);
3602   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3603   movl(tmp3, tmp2);
3604   shll(tmp3, 5);
3605   subl(tmp3, tmp2);
3606   addl(result, tmp3);
3607   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3608   addl(result, tmp3);
3609   addl(index, 2);
3610   cmpl(index, cnt1);
3611   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3612 
3613   // }
3614   // if (i >= cnt1) {
3615   bind(SHORT_UNROLLED_LOOP_EXIT);
3616   jccb(Assembler::greater, END);
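       // One trailing element: result = 31*result + ary1[i-1],
       // with 31*result computed as (result << 5) - result.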
3617   movl(tmp2, result);
3618   shll(result, 5);
3619   subl(result, tmp2);
3620   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3621   addl(result, tmp3);
3622   // }
3623   bind(END);
3624 
3625   BLOCK_COMMENT("} // arrays_hashcode");
3626 
3627 } // arrays_hashcode
3628 
3629 // helper function for string_compare
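     // Loads the element at 'index' from each string, zero-extended to 32 bits, with the
     // width implied by the encoding: bytes for LL, chars for UU, and a byte from str1
     // with a char from str2 for the mixed encodings.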
3630 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3631                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3632                                            Address::ScaleFactor scale2, Register index, int ae) {
3633   if (ae == StrIntrinsicNode::LL) {
3634     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3635     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3636   } else if (ae == StrIntrinsicNode::UU) {
3637     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3638     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3639   } else {
3640     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3641     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3642   }
3643 }
3644 
3645 // Compare strings, used for char[] and byte[].
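     // On return 'result' is negative, zero or positive, following String.compareTo
     // semantics: the difference of the first mismatching elements, or the difference
     // of the lengths if one string is a prefix of the other.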
3646 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3647                                        Register cnt1, Register cnt2, Register result,
3648                                        XMMRegister vec1, int ae, KRegister mask) {
3649   ShortBranchVerifier sbv(this);
3650   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3651   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only with AVX3
3652   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3653   int stride2x2 = 0x40;
3654   Address::ScaleFactor scale = Address::no_scale;
3655   Address::ScaleFactor scale1 = Address::no_scale;
3656   Address::ScaleFactor scale2 = Address::no_scale;
3657 
3658   if (ae != StrIntrinsicNode::LL) {
3659     stride2x2 = 0x20;
3660   }
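       // stride2x2 is the element count consumed per 64-byte AVX-512 compare iteration:
       // 64 Latin-1 bytes, or 32 chars when at least one string is UTF-16.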
3661 
3662   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3663     shrl(cnt2, 1);
3664   }
3665   // Compute the minimum of the string lengths and push the
3666   // difference of the string lengths onto the stack.
3667   // The minimum is computed with a conditional move.
3668   movl(result, cnt1);
3669   subl(cnt1, cnt2);
3670   push(cnt1);
3671   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3672 
3673   // Is the minimum length zero?
3674   testl(cnt2, cnt2);
3675   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3676   if (ae == StrIntrinsicNode::LL) {
3677     // Load first bytes
3678     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3679     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3680   } else if (ae == StrIntrinsicNode::UU) {
3681     // Load first characters
3682     load_unsigned_short(result, Address(str1, 0));
3683     load_unsigned_short(cnt1, Address(str2, 0));
3684   } else {
3685     load_unsigned_byte(result, Address(str1, 0));
3686     load_unsigned_short(cnt1, Address(str2, 0));
3687   }
3688   subl(result, cnt1);
3689   jcc(Assembler::notZero,  POP_LABEL);
3690 
3691   if (ae == StrIntrinsicNode::UU) {
3692     // Divide length by 2 to get number of chars
3693     shrl(cnt2, 1);
3694   }
3695   cmpl(cnt2, 1);
3696   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3697 
3698   // Check if the strings start at the same location and setup scale and stride
3699   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3700     cmpptr(str1, str2);
3701     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3702     if (ae == StrIntrinsicNode::LL) {
3703       scale = Address::times_1;
3704       stride = 16;
3705     } else {
3706       scale = Address::times_2;
3707       stride = 8;
3708     }
3709   } else {
3710     scale1 = Address::times_1;
3711     scale2 = Address::times_2;
3712     // scale not used
3713     stride = 8;
3714   }
3715 
3716   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3717     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3718     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3719     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3720     Label COMPARE_TAIL_LONG;
3721     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only with AVX3
3722 
3723     int pcmpmask = 0x19;
3724     if (ae == StrIntrinsicNode::LL) {
3725       pcmpmask &= ~0x01;
3726     }
3727 
3728     // Set up to compare 16-char (32-byte) vectors,
3729     // starting from the first character again because it has an aligned address.
3730     if (ae == StrIntrinsicNode::LL) {
3731       stride2 = 32;
3732     } else {
3733       stride2 = 16;
3734     }
3735     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3736       adr_stride = stride << scale;
3737     } else {
3738       adr_stride1 = 8;  //stride << scale1;
3739       adr_stride2 = 16; //stride << scale2;
3740     }
3741 
3742     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3743     // rax and rdx are used by pcmpestri as elements counters
3744     movl(result, cnt2);
3745     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3746     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3747 
3748     // fast path : compare first 2 8-char vectors.
3749     bind(COMPARE_16_CHARS);
3750     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3751       movdqu(vec1, Address(str1, 0));
3752     } else {
3753       pmovzxbw(vec1, Address(str1, 0));
3754     }
3755     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3756     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3757 
3758     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3759       movdqu(vec1, Address(str1, adr_stride));
3760       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3761     } else {
3762       pmovzxbw(vec1, Address(str1, adr_stride1));
3763       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3764     }
3765     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3766     addl(cnt1, stride);
3767 
3768     // Compare the characters at index in cnt1
3769     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3770     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3771     subl(result, cnt2);
3772     jmp(POP_LABEL);
3773     // Set up the registers to start the vector comparison loop
3774     // Setup the registers to start vector comparison loop
3775     bind(COMPARE_WIDE_VECTORS);
3776     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3777       lea(str1, Address(str1, result, scale));
3778       lea(str2, Address(str2, result, scale));
3779     } else {
3780       lea(str1, Address(str1, result, scale1));
3781       lea(str2, Address(str2, result, scale2));
3782     }
3783     subl(result, stride2);
3784     subl(cnt2, stride2);
3785     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3786     negptr(result);
3787 
3788     //  In a loop, compare 16 chars (32 bytes) at once using (vpxor+vptest)
3789     bind(COMPARE_WIDE_VECTORS_LOOP);
3790 
3791     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3792       cmpl(cnt2, stride2x2);
3793       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3794       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3795       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3796 
3797       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3798       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3799         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3800         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // mask == 11..11 if operands are equal, otherwise mask has some 0 bits
3801       } else {
3802         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3803         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // mask == 11..11 if operands are equal, otherwise mask has some 0 bits
3804       }
3805       kortestql(mask, mask);
3806       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3807       addptr(result, stride2x2);  // update since we already compared at this addr
3808       subl(cnt2, stride2x2);      // and sub the size too
3809       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3810 
3811       vpxor(vec1, vec1);
3812       jmpb(COMPARE_WIDE_TAIL);
3813     }//if (VM_Version::supports_avx512vlbw())
3814 
3815     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3816     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3817       vmovdqu(vec1, Address(str1, result, scale));
3818       vpxor(vec1, Address(str2, result, scale));
3819     } else {
3820       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3821       vpxor(vec1, Address(str2, result, scale2));
3822     }
3823     vptest(vec1, vec1);
3824     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3825     addptr(result, stride2);
3826     subl(cnt2, stride2);
3827     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3828     // clean upper bits of YMM registers
3829     vpxor(vec1, vec1);
3830 
3831     // compare wide vectors tail
3832     bind(COMPARE_WIDE_TAIL);
3833     testptr(result, result);
3834     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3835 
3836     movl(result, stride2);
3837     movl(cnt2, result);
3838     negptr(result);
3839     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3840 
3841     // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3842     bind(VECTOR_NOT_EQUAL);
3843     // clean upper bits of YMM registers
3844     vpxor(vec1, vec1);
3845     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3846       lea(str1, Address(str1, result, scale));
3847       lea(str2, Address(str2, result, scale));
3848     } else {
3849       lea(str1, Address(str1, result, scale1));
3850       lea(str2, Address(str2, result, scale2));
3851     }
3852     jmp(COMPARE_16_CHARS);
3853 
3854     // Compare tail chars, length between 1 and 15 chars
3855     bind(COMPARE_TAIL_LONG);
3856     movl(cnt2, result);
3857     cmpl(cnt2, stride);
3858     jcc(Assembler::less, COMPARE_SMALL_STR);
3859 
3860     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3861       movdqu(vec1, Address(str1, 0));
3862     } else {
3863       pmovzxbw(vec1, Address(str1, 0));
3864     }
3865     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3866     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3867     subptr(cnt2, stride);
3868     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3869     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3870       lea(str1, Address(str1, result, scale));
3871       lea(str2, Address(str2, result, scale));
3872     } else {
3873       lea(str1, Address(str1, result, scale1));
3874       lea(str2, Address(str2, result, scale2));
3875     }
3876     negptr(cnt2);
3877     jmpb(WHILE_HEAD_LABEL);
3878 
3879     bind(COMPARE_SMALL_STR);
3880   } else if (UseSSE42Intrinsics) {
3881     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3882     int pcmpmask = 0x19;
3883     // Set up to compare 8-char (16-byte) vectors,
3884     // starting from the first character again because it has an aligned address.
3885     movl(result, cnt2);
3886     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3887     if (ae == StrIntrinsicNode::LL) {
3888       pcmpmask &= ~0x01;
3889     }
3890     jcc(Assembler::zero, COMPARE_TAIL);
3891     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3892       lea(str1, Address(str1, result, scale));
3893       lea(str2, Address(str2, result, scale));
3894     } else {
3895       lea(str1, Address(str1, result, scale1));
3896       lea(str2, Address(str2, result, scale2));
3897     }
3898     negptr(result);
3899 
3900     // pcmpestri
3901     //   inputs:
3902     //     vec1- substring
3903     //     rax - negative string length (elements count)
3904     //     mem - scanned string
3905     //     rdx - string length (elements count)
3906     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3907     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3908     //   outputs:
3909     //     rcx - first mismatched element index
3910     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3911 
3912     bind(COMPARE_WIDE_VECTORS);
3913     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3914       movdqu(vec1, Address(str1, result, scale));
3915       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3916     } else {
3917       pmovzxbw(vec1, Address(str1, result, scale1));
3918       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3919     }
3920     // After pcmpestri cnt1(rcx) contains mismatched element index
3921 
3922     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3923     addptr(result, stride);
3924     subptr(cnt2, stride);
3925     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3926 
3927     // compare wide vectors tail
3928     testptr(result, result);
3929     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3930 
3931     movl(cnt2, stride);
3932     movl(result, stride);
3933     negptr(result);
3934     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3935       movdqu(vec1, Address(str1, result, scale));
3936       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3937     } else {
3938       pmovzxbw(vec1, Address(str1, result, scale1));
3939       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3940     }
3941     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3942 
3943     // Mismatched characters in the vectors
3944     bind(VECTOR_NOT_EQUAL);
3945     addptr(cnt1, result);
3946     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3947     subl(result, cnt2);
3948     jmpb(POP_LABEL);
3949 
3950     bind(COMPARE_TAIL); // limit is zero
3951     movl(cnt2, result);
3952     // Fallthru to tail compare
3953   }
3954   // Shift str2 and str1 to the end of the arrays, negate min
3955   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3956     lea(str1, Address(str1, cnt2, scale));
3957     lea(str2, Address(str2, cnt2, scale));
3958   } else {
3959     lea(str1, Address(str1, cnt2, scale1));
3960     lea(str2, Address(str2, cnt2, scale2));
3961   }
3962   decrementl(cnt2);  // first character was compared already
3963   negptr(cnt2);
3964 
3965   // Compare the rest of the elements
3966   bind(WHILE_HEAD_LABEL);
3967   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3968   subl(result, cnt1);
3969   jccb(Assembler::notZero, POP_LABEL);
3970   increment(cnt2);
3971   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3972 
3973   // Strings are equal up to min length.  Return the length difference.
3974   bind(LENGTH_DIFF_LABEL);
3975   pop(result);
3976   if (ae == StrIntrinsicNode::UU) {
3977     // Divide diff by 2 to get number of chars
3978     sarl(result, 1);
3979   }
3980   jmpb(DONE_LABEL);
3981 
3982   if (VM_Version::supports_avx512vlbw()) {
3983 
3984     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3985 
3986     kmovql(cnt1, mask);
3987     notq(cnt1);
3988     bsfq(cnt2, cnt1);
3989     if (ae != StrIntrinsicNode::LL) {
3990       // Divide diff by 2 to get number of chars
3991       sarl(cnt2, 1);
3992     }
3993     addq(result, cnt2);
3994     if (ae == StrIntrinsicNode::LL) {
3995       load_unsigned_byte(cnt1, Address(str2, result));
3996       load_unsigned_byte(result, Address(str1, result));
3997     } else if (ae == StrIntrinsicNode::UU) {
3998       load_unsigned_short(cnt1, Address(str2, result, scale));
3999       load_unsigned_short(result, Address(str1, result, scale));
4000     } else {
4001       load_unsigned_short(cnt1, Address(str2, result, scale2));
4002       load_unsigned_byte(result, Address(str1, result, scale1));
4003     }
4004     subl(result, cnt1);
4005     jmpb(POP_LABEL);
4006   }//if (VM_Version::supports_avx512vlbw())
4007 
4008   // Discard the stored length difference
4009   bind(POP_LABEL);
4010   pop(cnt1);
4011 
4012   // That's it
4013   bind(DONE_LABEL);
4014   if(ae == StrIntrinsicNode::UL) {
4015     negl(result);
4016   }
4017 
4018 }
4019 
4020 // Search for Non-ASCII character (Negative byte value) in a byte array,
4021 // return the index of the first such character, otherwise the length
4022 // of the array segment searched.
4023 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4024 //   @IntrinsicCandidate
4025 //   public static int countPositives(byte[] ba, int off, int len) {
4026 //     for (int i = off; i < off + len; i++) {
4027 //       if (ba[i] < 0) {
4028 //         return i - off;
4029 //       }
4030 //     }
4031 //     return len;
4032 //   }
4033 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4034   Register result, Register tmp1,
4035   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4036   // rsi: byte array
4037   // rcx: len
4038   // rax: result
4039   ShortBranchVerifier sbv(this);
4040   assert_different_registers(ary1, len, result, tmp1);
4041   assert_different_registers(vec1, vec2);
4042   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4043 
4044   movl(result, len); // copy
4045   // len == 0
4046   testl(len, len);
4047   jcc(Assembler::zero, DONE);
4048 
4049   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4050     VM_Version::supports_avx512vlbw() &&
4051     VM_Version::supports_bmi2()) {
4052 
4053     Label test_64_loop, test_tail, BREAK_LOOP;
4054     movl(tmp1, len);
4055     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4056 
4057     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4058     andl(len,  0xffffffc0); // vector count (in chars)
4059     jccb(Assembler::zero, test_tail);
4060 
4061     lea(ary1, Address(ary1, len, Address::times_1));
4062     negptr(len);
4063 
4064     bind(test_64_loop);
4065     // Check whether our 64 elements of size byte contain negatives
4066     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4067     kortestql(mask1, mask1);
4068     jcc(Assembler::notZero, BREAK_LOOP);
4069 
4070     addptr(len, 64);
4071     jccb(Assembler::notZero, test_64_loop);
4072 
4073     bind(test_tail);
4074     // bail out when there is nothing to be done
4075     testl(tmp1, -1);
4076     jcc(Assembler::zero, DONE);
4077 
4078 
4079     // Check the tail for the absence of negatives.
4080     // ~(~0 << len) applied up to two times (for 32-bit scenario)
4081     {
4082       Register tmp3_aliased = len;
4083       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4084       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4085       notq(tmp3_aliased);
4086       kmovql(mask2, tmp3_aliased);
4087     }
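         // mask2 now has exactly its low tmp1 bits set, so the masked compare below
         // inspects only the tail bytes and ignores the rest of the 64-byte vector.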
4088 
4089     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4090     ktestq(mask1, mask2);
4091     jcc(Assembler::zero, DONE);
4092 
4093     // do a full check for negative registers in the tail
4094     movl(len, tmp1); // tmp1 holds the low 6 bits of the original len;
4095                      // ary1 already pointing to the right place
4096     jmpb(TAIL_START);
4097 
4098     bind(BREAK_LOOP);
4099     // At least one byte in the last 64 byte block was negative.
4100     // Set up to look at the last 64 bytes as if they were a tail
4101     lea(ary1, Address(ary1, len, Address::times_1));
4102     addptr(result, len);
4103     // Ignore the very last byte: if all others are positive,
4104     // it must be negative, so we can skip right to the 2+1 byte
4105     // end comparison at this point
4106     orl(result, 63);
4107     movl(len, 63);
4108     // Fallthru to tail compare
4109   } else {
4110 
4111     if (UseAVX >= 2 && UseSSE >= 2) {
4112       // With AVX2, use 32-byte vector compare
4113       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4114 
4115       // Compare 32-byte vectors
4116       testl(len, 0xffffffe0);   // vector count (in bytes)
4117       jccb(Assembler::zero, TAIL_START);
4118 
4119       andl(len, 0xffffffe0);
4120       lea(ary1, Address(ary1, len, Address::times_1));
4121       negptr(len);
4122 
4123       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
4124       movdl(vec2, tmp1);
4125       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4126 
4127       bind(COMPARE_WIDE_VECTORS);
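           // vptest sets ZF only when none of the 32 loaded bytes has its sign bit set
           // (data AND 0x80808080... == 0); taking BREAK_LOOP therefore means a negative
           // byte was found in this chunk.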
4128       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4129       vptest(vec1, vec2);
4130       jccb(Assembler::notZero, BREAK_LOOP);
4131       addptr(len, 32);
4132       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4133 
4134       testl(result, 0x0000001f);   // any bytes remaining?
4135       jcc(Assembler::zero, DONE);
4136 
4137       // Quick test using the already prepared vector mask
4138       movl(len, result);
4139       andl(len, 0x0000001f);
4140       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4141       vptest(vec1, vec2);
4142       jcc(Assembler::zero, DONE);
4143       // Negative bytes are present; jump to the tail to determine exactly where
4144       jmpb(TAIL_START);
4145 
4146       bind(BREAK_LOOP);
4147       // At least one byte in the last 32-byte vector is negative.
4148       // Set up to look at the last 32 bytes as if they were a tail
4149       lea(ary1, Address(ary1, len, Address::times_1));
4150       addptr(result, len);
4151       // Ignore the very last byte: if all others are positive,
4152       // it must be negative, so we can skip right to the 2+1 byte
4153       // end comparison at this point
4154       orl(result, 31);
4155       movl(len, 31);
4156       // Fallthru to tail compare
4157     } else if (UseSSE42Intrinsics) {
4158       // With SSE4.2, use double quad vector compare
4159       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4160 
4161       // Compare 16-byte vectors
4162       testl(len, 0xfffffff0);   // vector count (in bytes)
4163       jcc(Assembler::zero, TAIL_START);
4164 
4165       andl(len, 0xfffffff0);
4166       lea(ary1, Address(ary1, len, Address::times_1));
4167       negptr(len);
4168 
4169       movl(tmp1, 0x80808080);
4170       movdl(vec2, tmp1);
4171       pshufd(vec2, vec2, 0);
4172 
4173       bind(COMPARE_WIDE_VECTORS);
4174       movdqu(vec1, Address(ary1, len, Address::times_1));
4175       ptest(vec1, vec2);
4176       jccb(Assembler::notZero, BREAK_LOOP);
4177       addptr(len, 16);
4178       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4179 
4180       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4181       jcc(Assembler::zero, DONE);
4182 
4183       // Quick test using the already prepared vector mask
4184       movl(len, result);
4185       andl(len, 0x0000000f);   // tail count (in bytes)
4186       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4187       ptest(vec1, vec2);
4188       jcc(Assembler::zero, DONE);
4189       jmpb(TAIL_START);
4190 
4191       bind(BREAK_LOOP);
4192       // At least one byte in the last 16-byte vector is negative.
4193       // Set up and look at the last 16 bytes as if they were a tail
4194       lea(ary1, Address(ary1, len, Address::times_1));
4195       addptr(result, len);
4196       // Ignore the very last byte: if all others are positive,
4197       // it must be negative, so we can skip right to the 2+1 byte
4198       // end comparison at this point
4199       orl(result, 15);
4200       movl(len, 15);
4201       // Fallthru to tail compare
4202     }
4203   }
4204 
4205   bind(TAIL_START);
4206   // Compare 4-byte vectors
4207   andl(len, 0xfffffffc); // vector count (in bytes)
4208   jccb(Assembler::zero, COMPARE_CHAR);
4209 
4210   lea(ary1, Address(ary1, len, Address::times_1));
4211   negptr(len);
4212 
4213   bind(COMPARE_VECTORS);
4214   movl(tmp1, Address(ary1, len, Address::times_1));
4215   andl(tmp1, 0x80808080);
4216   jccb(Assembler::notZero, TAIL_ADJUST);
4217   addptr(len, 4);
4218   jccb(Assembler::notZero, COMPARE_VECTORS);
4219 
4220   // Compare trailing char (final 2-3 bytes), if any
4221   bind(COMPARE_CHAR);
4222 
4223   testl(result, 0x2);   // tail  char
4224   jccb(Assembler::zero, COMPARE_BYTE);
4225   load_unsigned_short(tmp1, Address(ary1, 0));
4226   andl(tmp1, 0x00008080);
4227   jccb(Assembler::notZero, CHAR_ADJUST);
4228   lea(ary1, Address(ary1, 2));
4229 
4230   bind(COMPARE_BYTE);
4231   testl(result, 0x1);   // tail  byte
4232   jccb(Assembler::zero, DONE);
4233   load_unsigned_byte(tmp1, Address(ary1, 0));
4234   testl(tmp1, 0x00000080);
4235   jccb(Assembler::zero, DONE);
4236   subptr(result, 1);
4237   jmpb(DONE);
4238 
4239   bind(TAIL_ADJUST);
4240   // there are negative bits in the last 4 byte block.
4241   // Adjust result and check the next three bytes
4242   addptr(result, len);
4243   orl(result, 3);
4244   lea(ary1, Address(ary1, len, Address::times_1));
4245   jmpb(COMPARE_CHAR);
4246 
4247   bind(CHAR_ADJUST);
4248   // We are looking at a char + optional byte tail, and found that one
4249   // of the bytes in the char is negative. Adjust the result, check the
4250   // first byte and readjust if needed.
4251   andl(result, 0xfffffffc);
4252   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4253   jccb(Assembler::notZero, DONE);
4254   addptr(result, 1);
4255 
4256   // That's it
4257   bind(DONE);
4258   if (UseAVX >= 2 && UseSSE >= 2) {
4259     // clean upper bits of YMM registers
4260     vpxor(vec1, vec1);
4261     vpxor(vec2, vec2);
4262   }
4263 }
4264 
4265 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4266 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4267                                       Register limit, Register result, Register chr,
4268                                       XMMRegister vec1, XMMRegister vec2, bool is_char,
4269                                       KRegister mask, bool expand_ary2) {
4270   // for expand_ary2, limit is the (smaller) size of the second array.
4271   ShortBranchVerifier sbv(this);
4272   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4273 
4274   assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4275          "Expansion only implemented for AVX2");
4276 
4277   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4278   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4279 
4280   Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4281   int scaleIncr = expand_ary2 ? 8 : 16;
4282 
4283   if (is_array_equ) {
4284     // Check the input args
4285     cmpoop(ary1, ary2);
4286     jcc(Assembler::equal, TRUE_LABEL);
4287 
4288     // Need additional checks for arrays_equals.
4289     testptr(ary1, ary1);
4290     jcc(Assembler::zero, FALSE_LABEL);
4291     testptr(ary2, ary2);
4292     jcc(Assembler::zero, FALSE_LABEL);
4293 
4294     // Check the lengths
4295     movl(limit, Address(ary1, length_offset));
4296     cmpl(limit, Address(ary2, length_offset));
4297     jcc(Assembler::notEqual, FALSE_LABEL);
4298   }
4299 
4300   // count == 0
4301   testl(limit, limit);
4302   jcc(Assembler::zero, TRUE_LABEL);
4303 
4304   if (is_array_equ) {
4305     // Load array address
4306     lea(ary1, Address(ary1, base_offset));
4307     lea(ary2, Address(ary2, base_offset));
4308   }
4309 
4310   if (is_array_equ && is_char) {
4311     // arrays_equals when used for char[].
4312     shll(limit, 1);      // byte count != 0
4313   }
4314   movl(result, limit); // copy
4315 
4316   if (UseAVX >= 2) {
4317     // With AVX2, use 32-byte vector compare
4318     Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4319 
4320     // Compare 32-byte vectors
4321     if (expand_ary2) {
4322       andl(result, 0x0000000f);  //   tail count (in bytes)
4323       andl(limit, 0xfffffff0);   // vector count (in bytes)
4324       jcc(Assembler::zero, COMPARE_TAIL);
4325     } else {
4326       andl(result, 0x0000001f);  //   tail count (in bytes)
4327       andl(limit, 0xffffffe0);   // vector count (in bytes)
4328       jcc(Assembler::zero, COMPARE_TAIL_16);
4329     }
4330 
4331     lea(ary1, Address(ary1, limit, scaleFactor));
4332     lea(ary2, Address(ary2, limit, Address::times_1));
4333     negptr(limit);
4334 
4335     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4336       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4337 
4338       cmpl(limit, -64);
4339       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4340 
4341       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4342 
4343       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4344       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4345       kortestql(mask, mask);
4346       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4347       addptr(limit, 64);  // update since we already compared at this addr
4348       cmpl(limit, -64);
4349       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4350 
4351       // At this point we may still need to compare -limit+result bytes.
4352       // We could execute the next two instructions and just continue via the non-wide path:
4353       //  cmpl(limit, 0);
4354       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4355       // But since we stopped at the points ary{1,2}+limit which are
4356       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4357       // (|limit| <= 32 and result < 32),
4358       // we may just compare the last 64 bytes.
4359       //
4360       addptr(result, -64);   // it is safe, because we just came from this area
4361       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4362       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4363       kortestql(mask, mask);
4364       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4365 
4366       jmp(TRUE_LABEL);
4367 
4368       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4369 
4370     } // if (VM_Version::supports_avx512vlbw())
4371 
4372     bind(COMPARE_WIDE_VECTORS);
4373     vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4374     if (expand_ary2) {
4375       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4376     } else {
4377       vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4378     }
4379     vpxor(vec1, vec2);
4380 
4381     vptest(vec1, vec1);
4382     jcc(Assembler::notZero, FALSE_LABEL);
4383     addptr(limit, scaleIncr * 2);
4384     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4385 
4386     testl(result, result);
4387     jcc(Assembler::zero, TRUE_LABEL);
4388 
4389     vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4390     if (expand_ary2) {
4391       vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4392     } else {
4393       vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4394     }
4395     vpxor(vec1, vec2);
4396 
4397     vptest(vec1, vec1);
4398     jcc(Assembler::notZero, FALSE_LABEL);
4399     jmp(TRUE_LABEL);
4400 
4401     bind(COMPARE_TAIL_16); // limit is zero
4402     movl(limit, result);
4403 
4404     // Compare 16-byte chunks
4405     andl(result, 0x0000000f);  //   tail count (in bytes)
4406     andl(limit, 0xfffffff0);   // vector count (in bytes)
4407     jcc(Assembler::zero, COMPARE_TAIL);
4408 
4409     lea(ary1, Address(ary1, limit, scaleFactor));
4410     lea(ary2, Address(ary2, limit, Address::times_1));
4411     negptr(limit);
4412 
4413     bind(COMPARE_WIDE_VECTORS_16);
4414     movdqu(vec1, Address(ary1, limit, scaleFactor));
4415     if (expand_ary2) {
4416       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4417     } else {
4418       movdqu(vec2, Address(ary2, limit, Address::times_1));
4419     }
4420     pxor(vec1, vec2);
4421 
4422     ptest(vec1, vec1);
4423     jcc(Assembler::notZero, FALSE_LABEL);
4424     addptr(limit, scaleIncr);
4425     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4426 
4427     bind(COMPARE_TAIL); // limit is zero
4428     movl(limit, result);
4429     // Fallthru to tail compare
4430   } else if (UseSSE42Intrinsics) {
4431     // With SSE4.2, use double quad vector compare
4432     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4433 
4434     // Compare 16-byte vectors
4435     andl(result, 0x0000000f);  //   tail count (in bytes)
4436     andl(limit, 0xfffffff0);   // vector count (in bytes)
4437     jcc(Assembler::zero, COMPARE_TAIL);
4438 
4439     lea(ary1, Address(ary1, limit, Address::times_1));
4440     lea(ary2, Address(ary2, limit, Address::times_1));
4441     negptr(limit);
4442 
4443     bind(COMPARE_WIDE_VECTORS);
4444     movdqu(vec1, Address(ary1, limit, Address::times_1));
4445     movdqu(vec2, Address(ary2, limit, Address::times_1));
4446     pxor(vec1, vec2);
4447 
4448     ptest(vec1, vec1);
4449     jcc(Assembler::notZero, FALSE_LABEL);
4450     addptr(limit, 16);
4451     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4452 
4453     testl(result, result);
4454     jcc(Assembler::zero, TRUE_LABEL);
4455 
4456     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4457     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4458     pxor(vec1, vec2);
4459 
4460     ptest(vec1, vec1);
4461     jccb(Assembler::notZero, FALSE_LABEL);
4462     jmpb(TRUE_LABEL);
4463 
4464     bind(COMPARE_TAIL); // limit is zero
4465     movl(limit, result);
4466     // Fallthru to tail compare
4467   }
4468 
4469   // Compare 4-byte vectors
4470   if (expand_ary2) {
4471     testl(result, result);
4472     jccb(Assembler::zero, TRUE_LABEL);
4473   } else {
4474     andl(limit, 0xfffffffc); // vector count (in bytes)
4475     jccb(Assembler::zero, COMPARE_CHAR);
4476   }
4477 
4478   lea(ary1, Address(ary1, limit, scaleFactor));
4479   lea(ary2, Address(ary2, limit, Address::times_1));
4480   negptr(limit);
4481 
4482   bind(COMPARE_VECTORS);
4483   if (expand_ary2) {
4484     // There are no "vector" operations for bytes to shorts
4485     movzbl(chr, Address(ary2, limit, Address::times_1));
4486     cmpw(Address(ary1, limit, Address::times_2), chr);
4487     jccb(Assembler::notEqual, FALSE_LABEL);
4488     addptr(limit, 1);
4489     jcc(Assembler::notZero, COMPARE_VECTORS);
4490     jmp(TRUE_LABEL);
4491   } else {
4492     movl(chr, Address(ary1, limit, Address::times_1));
4493     cmpl(chr, Address(ary2, limit, Address::times_1));
4494     jccb(Assembler::notEqual, FALSE_LABEL);
4495     addptr(limit, 4);
4496     jcc(Assembler::notZero, COMPARE_VECTORS);
4497   }
4498 
4499   // Compare trailing char (final 2 bytes), if any
4500   bind(COMPARE_CHAR);
4501   testl(result, 0x2);   // tail  char
4502   jccb(Assembler::zero, COMPARE_BYTE);
4503   load_unsigned_short(chr, Address(ary1, 0));
4504   load_unsigned_short(limit, Address(ary2, 0));
4505   cmpl(chr, limit);
4506   jccb(Assembler::notEqual, FALSE_LABEL);
4507 
4508   if (is_array_equ && is_char) {
4509     bind(COMPARE_BYTE);
4510   } else {
4511     lea(ary1, Address(ary1, 2));
4512     lea(ary2, Address(ary2, 2));
4513 
4514     bind(COMPARE_BYTE);
4515     testl(result, 0x1);   // tail  byte
4516     jccb(Assembler::zero, TRUE_LABEL);
4517     load_unsigned_byte(chr, Address(ary1, 0));
4518     load_unsigned_byte(limit, Address(ary2, 0));
4519     cmpl(chr, limit);
4520     jccb(Assembler::notEqual, FALSE_LABEL);
4521   }
4522   bind(TRUE_LABEL);
4523   movl(result, 1);   // return true
4524   jmpb(DONE);
4525 
4526   bind(FALSE_LABEL);
4527   xorl(result, result); // return false
4528 
4529   // That's it
4530   bind(DONE);
4531   if (UseAVX >= 2) {
4532     // clean upper bits of YMM registers
4533     vpxor(vec1, vec1);
4534     vpxor(vec2, vec2);
4535   }
4536 }
4537 
4538 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4539 #define __ masm.
4540   Register dst = stub.data<0>();
4541   XMMRegister src = stub.data<1>();
4542   address target = stub.data<2>();
4543   __ bind(stub.entry());
4544   __ subptr(rsp, 8);
4545   __ movdbl(Address(rsp), src);
4546   __ call(RuntimeAddress(target));
4547   __ pop(dst);
4548   __ jmp(stub.continuation());
4549 #undef __
4550 }
4551 
4552 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4553   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4554   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4555 
4556   address slowpath_target;
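       // cvttss2si/cvttsd2si return the 'integer indefinite' value (0x80000000,
       // or 0x8000000000000000 for 64-bit results) for NaN and out-of-range inputs;
       // the compares below detect that value and divert to the matching fixup stub,
       // which implements the Java cast semantics.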
4557   if (dst_bt == T_INT) {
4558     if (src_bt == T_FLOAT) {
4559       cvttss2sil(dst, src);
4560       cmpl(dst, 0x80000000);
4561       slowpath_target = StubRoutines::x86::f2i_fixup();
4562     } else {
4563       cvttsd2sil(dst, src);
4564       cmpl(dst, 0x80000000);
4565       slowpath_target = StubRoutines::x86::d2i_fixup();
4566     }
4567   } else {
4568     if (src_bt == T_FLOAT) {
4569       cvttss2siq(dst, src);
4570       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4571       slowpath_target = StubRoutines::x86::f2l_fixup();
4572     } else {
4573       cvttsd2siq(dst, src);
4574       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4575       slowpath_target = StubRoutines::x86::d2l_fixup();
4576     }
4577   }
4578 
4579   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4580   jcc(Assembler::equal, stub->entry());
4581   bind(stub->continuation());
4582 }
4583 
4584 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4585                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4586   switch(ideal_opc) {
4587     case Op_LShiftVS:
4588       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4589     case Op_LShiftVI:
4590       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4591     case Op_LShiftVL:
4592       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4593     case Op_RShiftVS:
4594       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4595     case Op_RShiftVI:
4596       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4597     case Op_RShiftVL:
4598       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4599     case Op_URShiftVS:
4600       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4601     case Op_URShiftVI:
4602       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4603     case Op_URShiftVL:
4604       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4605     case Op_RotateRightV:
4606       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4607     case Op_RotateLeftV:
4608       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4609     default:
4610       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4611       break;
4612   }
4613 }
4614 
4615 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4616                                                XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4617   if (is_unsigned) {
4618     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4619   } else {
4620     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4621   }
4622 }
4623 
4624 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4625                                                       XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4626   switch (elem_bt) {
4627     case T_BYTE:
4628       if (ideal_opc == Op_SaturatingAddV) {
4629         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4630       } else {
4631         assert(ideal_opc == Op_SaturatingSubV, "");
4632         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4633       }
4634       break;
4635     case T_SHORT:
4636       if (ideal_opc == Op_SaturatingAddV) {
4637         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4638       } else {
4639         assert(ideal_opc == Op_SaturatingSubV, "");
4640         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4641       }
4642       break;
4643     default:
4644       fatal("Unsupported type %s", type2name(elem_bt));
4645       break;
4646   }
4647 }
4648 
4649 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4650                                                         XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4651   switch (elem_bt) {
4652     case T_BYTE:
4653       if (ideal_opc == Op_SaturatingAddV) {
4654         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4655       } else {
4656         assert(ideal_opc == Op_SaturatingSubV, "");
4657         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4658       }
4659       break;
4660     case T_SHORT:
4661       if (ideal_opc == Op_SaturatingAddV) {
4662         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4663       } else {
4664         assert(ideal_opc == Op_SaturatingSubV, "");
4665         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4666       }
4667       break;
4668     default:
4669       fatal("Unsupported type %s", type2name(elem_bt));
4670       break;
4671   }
4672 }
4673 
4674 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4675                                                Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4676   if (is_unsigned) {
4677     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4678   } else {
4679     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4680   }
4681 }
4682 
4683 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4684                                                       XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4685   switch (elem_bt) {
4686     case T_BYTE:
4687       if (ideal_opc == Op_SaturatingAddV) {
4688         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4689       } else {
4690         assert(ideal_opc == Op_SaturatingSubV, "");
4691         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4692       }
4693       break;
4694     case T_SHORT:
4695       if (ideal_opc == Op_SaturatingAddV) {
4696         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4697       } else {
4698         assert(ideal_opc == Op_SaturatingSubV, "");
4699         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4700       }
4701       break;
4702     default:
4703       fatal("Unsupported type %s", type2name(elem_bt));
4704       break;
4705   }
4706 }
4707 
4708 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4709                                                         XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4710   switch (elem_bt) {
4711     case T_BYTE:
4712       if (ideal_opc == Op_SaturatingAddV) {
4713         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4714       } else {
4715         assert(ideal_opc == Op_SaturatingSubV, "");
4716         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4717       }
4718       break;
4719     case T_SHORT:
4720       if (ideal_opc == Op_SaturatingAddV) {
4721         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4722       } else {
4723         assert(ideal_opc == Op_SaturatingSubV, "");
4724         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4725       }
4726       break;
4727     default:
4728       fatal("Unsupported type %s", type2name(elem_bt));
4729       break;
4730   }
4731 }
4732 
4733 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4734                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4735                                     bool is_varshift) {
4736   switch (ideal_opc) {
4737     case Op_AddVB:
4738       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4739     case Op_AddVS:
4740       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4741     case Op_AddVI:
4742       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4743     case Op_AddVL:
4744       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4745     case Op_AddVF:
4746       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4747     case Op_AddVD:
4748       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4749     case Op_SubVB:
4750       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4751     case Op_SubVS:
4752       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4753     case Op_SubVI:
4754       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4755     case Op_SubVL:
4756       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4757     case Op_SubVF:
4758       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4759     case Op_SubVD:
4760       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4761     case Op_MulVS:
4762       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4763     case Op_MulVI:
4764       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4765     case Op_MulVL:
4766       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4767     case Op_MulVF:
4768       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4769     case Op_MulVD:
4770       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4771     case Op_DivVF:
4772       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4773     case Op_DivVD:
4774       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4775     case Op_SqrtVF:
4776       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4777     case Op_SqrtVD:
4778       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4779     case Op_AbsVB:
4780       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4781     case Op_AbsVS:
4782       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4783     case Op_AbsVI:
4784       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4785     case Op_AbsVL:
4786       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4787     case Op_FmaVF:
4788       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4789     case Op_FmaVD:
4790       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4791     case Op_VectorRearrange:
4792       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4793     case Op_LShiftVS:
4794       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4795     case Op_LShiftVI:
4796       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4797     case Op_LShiftVL:
4798       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4799     case Op_RShiftVS:
4800       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4801     case Op_RShiftVI:
4802       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4803     case Op_RShiftVL:
4804       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4805     case Op_URShiftVS:
4806       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4807     case Op_URShiftVI:
4808       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4809     case Op_URShiftVL:
4810       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4811     case Op_RotateLeftV:
4812       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4813     case Op_RotateRightV:
4814       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4815     case Op_MaxV:
4816       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4817     case Op_MinV:
4818       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4819     case Op_UMinV:
4820       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4821     case Op_UMaxV:
4822       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4823     case Op_XorV:
4824       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4825     case Op_OrV:
4826       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4827     case Op_AndV:
4828       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4829     default:
4830       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4831       break;
4832   }
4833 }
4834 
4835 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4836                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4837   switch (ideal_opc) {
4838     case Op_AddVB:
4839       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4840     case Op_AddVS:
4841       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4842     case Op_AddVI:
4843       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4844     case Op_AddVL:
4845       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4846     case Op_AddVF:
4847       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4848     case Op_AddVD:
4849       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4850     case Op_SubVB:
4851       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4852     case Op_SubVS:
4853       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4854     case Op_SubVI:
4855       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4856     case Op_SubVL:
4857       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4858     case Op_SubVF:
4859       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4860     case Op_SubVD:
4861       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4862     case Op_MulVS:
4863       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4864     case Op_MulVI:
4865       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4866     case Op_MulVL:
4867       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4868     case Op_MulVF:
4869       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4870     case Op_MulVD:
4871       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4872     case Op_DivVF:
4873       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4874     case Op_DivVD:
4875       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4876     case Op_FmaVF:
4877       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4878     case Op_FmaVD:
4879       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4880     case Op_MaxV:
4881       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4882     case Op_MinV:
4883       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4884     case Op_UMaxV:
4885       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4886     case Op_UMinV:
4887       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4888     case Op_XorV:
4889       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4890     case Op_OrV:
4891       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4892     case Op_AndV:
4893       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4894     default:
4895       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4896       break;
4897   }
4898 }
4899 
4900 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4901                                   KRegister src1, KRegister src2) {
4902   BasicType etype = T_ILLEGAL;
4903   switch(mask_len) {
4904     case 2:
4905     case 4:
4906     case 8:  etype = T_BYTE; break;
4907     case 16: etype = T_SHORT; break;
4908     case 32: etype = T_INT; break;
4909     case 64: etype = T_LONG; break;
4910     default: fatal("Unsupported type"); break;
4911   }
4912   assert(etype != T_ILLEGAL, "");
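       // The element type encodes the mask width so that kand/kor/kxor below pick
       // the matching 8/16/32/64-bit k-register instruction forms.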
4913   switch(ideal_opc) {
4914     case Op_AndVMask:
4915       kand(etype, dst, src1, src2); break;
4916     case Op_OrVMask:
4917       kor(etype, dst, src1, src2); break;
4918     case Op_XorVMask:
4919       kxor(etype, dst, src1, src2); break;
4920     default:
4921       fatal("Unsupported masked operation"); break;
4922   }
4923 }
4924 
4925 /*
4926  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4927  * If src is NaN, the result is 0.
4928  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4929  * the result is equal to the value of Integer.MIN_VALUE.
4930  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4931  * the result is equal to the value of Integer.MAX_VALUE.
4932  */
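     // For example, under Java semantics (int)Float.NaN == 0,
     // (int)Float.NEGATIVE_INFINITY == Integer.MIN_VALUE, and
     // (int)Float.POSITIVE_INFINITY == (int)1e20f == Integer.MAX_VALUE.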
4933 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4934                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4935                                                                    Register rscratch, AddressLiteral float_sign_flip,
4936                                                                    int vec_enc) {
4937   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4938   Label done;
4939   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4940   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4941   vptest(xtmp2, xtmp2, vec_enc);
4942   jccb(Assembler::equal, done);
4943 
4944   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4945   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4946 
4947   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4948   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4949   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4950 
4951   // Recompute the mask for remaining special value.
4952   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4953   // Extract SRC values corresponding to TRUE mask lanes.
4954   vpand(xtmp4, xtmp2, src, vec_enc);
4955   // Flip mask bits so that the MSB of the mask lanes corresponding to positive
4956   // special values is set.
4957   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4958 
4959   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4960   bind(done);
4961 }
4962 
4963 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4964                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4965                                                                     Register rscratch, AddressLiteral float_sign_flip,
4966                                                                     int vec_enc) {
4967   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4968   Label done;
4969   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4970   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4971   kortestwl(ktmp1, ktmp1);
4972   jccb(Assembler::equal, done);
4973 
4974   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4975   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4976   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4977 
4978   kxorwl(ktmp1, ktmp1, ktmp2);
4979   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4980   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4981   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4982   bind(done);
4983 }
4984 
4985 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4986                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4987                                                                      Register rscratch, AddressLiteral double_sign_flip,
4988                                                                      int vec_enc) {
4989   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4990 
4991   Label done;
4992   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4993   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4994   kortestwl(ktmp1, ktmp1);
4995   jccb(Assembler::equal, done);
4996 
4997   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4998   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4999   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5000 
5001   kxorwl(ktmp1, ktmp1, ktmp2);
5002   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5003   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5004   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5005   bind(done);
5006 }
5007 
5008 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5009                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5010                                                                      Register rscratch, AddressLiteral float_sign_flip,
5011                                                                      int vec_enc) {
5012   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5013   Label done;
5014   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5015   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5016   kortestwl(ktmp1, ktmp1);
5017   jccb(Assembler::equal, done);
5018 
5019   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5020   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5021   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5022 
5023   kxorwl(ktmp1, ktmp1, ktmp2);
5024   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5025   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5026   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5027   bind(done);
5028 }
5029 
5030 /*
5031  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
5032  * If src is NaN, the result is 0.
5033  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5034  * the result is equal to the value of Long.MIN_VALUE.
5035  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5036  * the result is equal to the value of Long.MAX_VALUE.
5037  */
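     // For example, under Java semantics (long)Double.NaN == 0L,
     // (long)Double.NEGATIVE_INFINITY == Long.MIN_VALUE, and
     // (long)Double.POSITIVE_INFINITY == Long.MAX_VALUE.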
5038 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5039                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5040                                                                       Register rscratch, AddressLiteral double_sign_flip,
5041                                                                       int vec_enc) {
5042   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5043 
5044   Label done;
5045   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5046   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
5047   kortestwl(ktmp1, ktmp1);
5048   jccb(Assembler::equal, done);
5049 
5050   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5051   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5052   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5053 
5054   kxorwl(ktmp1, ktmp1, ktmp2);
5055   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5056   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5057   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5058   bind(done);
5059 }
5060 
5061 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5062                                                              XMMRegister xtmp, int index, int vec_enc) {
5063   assert(vec_enc < Assembler::AVX_512bit, "");
5064   if (vec_enc == Assembler::AVX_256bit) {
5065     vextractf128_high(xtmp, src);
5066     vshufps(dst, src, xtmp, index, vec_enc);
5067   } else {
5068     vshufps(dst, src, zero, index, vec_enc);
5069   }
5070 }
5071 
5072 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5073                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5074                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
5075   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5076 
5077   Label done;
5078   // Compare the destination lanes with float_sign_flip
5079   // value to get mask for all special values.
5080   movdqu(xtmp1, float_sign_flip, rscratch);
5081   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5082   ptest(xtmp2, xtmp2);
5083   jccb(Assembler::equal, done);
5084 
5085   // Flip float_sign_flip to get max integer value.
5086   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5087   pxor(xtmp1, xtmp4);
5088 
5089   // Set destination lanes corresponding to unordered source lanes to zero.
5090   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5091   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5092 
5093   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5094   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5095   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5096 
5097   // Recompute the mask for remaining special value.
5098   pxor(xtmp2, xtmp3);
5099   // Extract mask corresponding to non-negative source lanes.
5100   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5101 
5102   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5103   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5104   pand(xtmp3, xtmp2);
5105 
5106   // Replace destination lanes holding the special value (0x80000000) with max int
5107   // if the corresponding source lane holds a positive value.
5108   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5109   bind(done);
5110 }
5111 
5112 
5113 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5114                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
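       // The vpand with the int_to_short/int_to_byte mask is expected to clear the
       // upper bits of each int lane so that the unsigned saturating packs below
       // never saturate and act as a plain truncation to the subword type.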
5115   switch(to_elem_bt) {
5116     case T_SHORT:
5117       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5118       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5119       vpackusdw(dst, dst, zero, vec_enc);
5120       if (vec_enc == Assembler::AVX_256bit) {
5121         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5122       }
5123       break;
5124     case  T_BYTE:
5125       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5126       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5127       vpackusdw(dst, dst, zero, vec_enc);
5128       if (vec_enc == Assembler::AVX_256bit) {
5129         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5130       }
5131       vpackuswb(dst, dst, zero, vec_enc);
5132       break;
5133     default: assert(false, "%s", type2name(to_elem_bt));
5134   }
5135 }
5136 
5137 /*
5138  * Algorithm for vector D2L and F2I conversions:
5139  * a) Perform the vector D2L/F2I cast.
5140  * b) Choose the fast path if none of the result vector lanes contains the value 0x80000000,
5141  *    which signifies that the source value could be one of the special floating point
5142  *    values (NaN, -Inf, Inf, Max, -Min).
5143  * c) Set the destination lane to zero if the source lane is NaN.
5144  * d) Replace 0x80000000 with MaxInt if the source lane contains a positive value.
5145  */
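     // Note: 0x80000000 is the 'integer indefinite' value that vcvttps2dq/vcvttpd2dq
     // produce for NaN and out-of-range inputs, which is why comparing the result
     // against the sign-flip constant identifies the lanes that need fixing up.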
5146 
5147 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5148                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5149                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5150   int to_elem_sz = type2aelembytes(to_elem_bt);
5151   assert(to_elem_sz <= 4, "");
5152   vcvttps2dq(dst, src, vec_enc);
5153   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5154   if (to_elem_sz < 4) {
5155     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5156     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5157   }
5158 }
5159 
5160 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5161                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5162                                             Register rscratch, int vec_enc) {
5163   int to_elem_sz = type2aelembytes(to_elem_bt);
5164   assert(to_elem_sz <= 4, "");
5165   vcvttps2dq(dst, src, vec_enc);
5166   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5167   switch(to_elem_bt) {
5168     case T_INT:
5169       break;
5170     case T_SHORT:
5171       evpmovdw(dst, dst, vec_enc);
5172       break;
5173     case T_BYTE:
5174       evpmovdb(dst, dst, vec_enc);
5175       break;
5176     default: assert(false, "%s", type2name(to_elem_bt));
5177   }
5178 }
5179 
5180 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5181                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5182                                             Register rscratch, int vec_enc) {
5183   evcvttps2qq(dst, src, vec_enc);
5184   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5185 }
5186 
5187 // Handling for downcasting from double to integer or sub-word types on AVX2.
5188 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5189                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5190                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5191   int to_elem_sz = type2aelembytes(to_elem_bt);
5192   assert(to_elem_sz < 8, "");
5193   vcvttpd2dq(dst, src, vec_enc);
5194   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5195                                               float_sign_flip, vec_enc);
5196   if (to_elem_sz < 4) {
5197     // xtmp4 holds all zero lanes.
5198     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5199   }
5200 }
5201 
5202 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5203                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5204                                             KRegister ktmp2, AddressLiteral sign_flip,
5205                                             Register rscratch, int vec_enc) {
5206   if (VM_Version::supports_avx512dq()) {
5207     evcvttpd2qq(dst, src, vec_enc);
5208     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5209     switch(to_elem_bt) {
5210       case T_LONG:
5211         break;
5212       case T_INT:
5213         evpmovsqd(dst, dst, vec_enc);
5214         break;
5215       case T_SHORT:
5216         evpmovsqd(dst, dst, vec_enc);
5217         evpmovdw(dst, dst, vec_enc);
5218         break;
5219       case T_BYTE:
5220         evpmovsqd(dst, dst, vec_enc);
5221         evpmovdb(dst, dst, vec_enc);
5222         break;
5223       default: assert(false, "%s", type2name(to_elem_bt));
5224     }
5225   } else {
5226     assert(type2aelembytes(to_elem_bt) <= 4, "");
5227     vcvttpd2dq(dst, src, vec_enc);
5228     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5229     switch(to_elem_bt) {
5230       case T_INT:
5231         break;
5232       case T_SHORT:
5233         evpmovdw(dst, dst, vec_enc);
5234         break;
5235       case T_BYTE:
5236         evpmovdb(dst, dst, vec_enc);
5237         break;
5238       default: assert(false, "%s", type2name(to_elem_bt));
5239     }
5240   }
5241 }
5242 
5243 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5244                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5245                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5246   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
5247   // and restore the original MXCSR.RC mode afterwards.
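       // For example, Math.round(2.5) == 3 and Math.round(-2.5) == -2: ties round
       // towards positive infinity, which floor(val + 0.5) provides.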
5248   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5249 
5250   mov64(tmp, julong_cast(0.5L));
5251   evpbroadcastq(xtmp1, tmp, vec_enc);
5252   vaddpd(xtmp1, src , xtmp1, vec_enc);
5253   evcvtpd2qq(dst, xtmp1, vec_enc);
5254   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5255                                                 double_sign_flip, vec_enc);
5256 
5257   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5258 }
5259 
5260 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5261                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5262                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5263   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
5264   // and restore the original MXCSR.RC mode afterwards.
5265   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5266 
5267   movl(tmp, jint_cast(0.5));
5268   movq(xtmp1, tmp);
5269   vbroadcastss(xtmp1, xtmp1, vec_enc);
5270   vaddps(xtmp1, src , xtmp1, vec_enc);
5271   vcvtps2dq(dst, xtmp1, vec_enc);
5272   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5273                                               float_sign_flip, vec_enc);
5274 
5275   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5276 }
5277 
5278 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5279                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5280                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
5281   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
5282   // and restore the original MXCSR.RC mode afterwards.
5283   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5284 
5285   movl(tmp, jint_cast(0.5));
5286   movq(xtmp1, tmp);
5287   vbroadcastss(xtmp1, xtmp1, vec_enc);
5288   vaddps(xtmp1, src , xtmp1, vec_enc);
5289   vcvtps2dq(dst, xtmp1, vec_enc);
5290   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5291 
5292   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5293 }
5294 
5295 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5296                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5297   switch (from_elem_bt) {
5298     case T_BYTE:
5299       switch (to_elem_bt) {
5300         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5301         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5302         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5303         default: ShouldNotReachHere();
5304       }
5305       break;
5306     case T_SHORT:
5307       switch (to_elem_bt) {
5308         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5309         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5310         default: ShouldNotReachHere();
5311       }
5312       break;
5313     case T_INT:
5314       assert(to_elem_bt == T_LONG, "");
5315       vpmovzxdq(dst, src, vlen_enc);
5316       break;
5317     default:
5318       ShouldNotReachHere();
5319   }
5320 }
5321 
5322 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5323                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5324   switch (from_elem_bt) {
5325     case T_BYTE:
5326       switch (to_elem_bt) {
5327         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5328         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5329         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5330         default: ShouldNotReachHere();
5331       }
5332       break;
5333     case T_SHORT:
5334       switch (to_elem_bt) {
5335         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5336         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5337         default: ShouldNotReachHere();
5338       }
5339       break;
5340     case T_INT:
5341       assert(to_elem_bt == T_LONG, "");
5342       vpmovsxdq(dst, src, vlen_enc);
5343       break;
5344     default:
5345       ShouldNotReachHere();
5346   }
5347 }
5348 
5349 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5350                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5351   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5352   assert(vlen_enc != AVX_512bit, "");
5353 
5354   int dst_bt_size = type2aelembytes(dst_bt);
5355   int src_bt_size = type2aelembytes(src_bt);
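       // Mask lanes are all-zeros or all-ones, so sign extension widens them and the
       // signed saturating packs (which map 0 -> 0 and -1 -> -1) narrow them without
       // losing information; vpermq(0x08) gathers the per-128-bit-lane pack results
       // into the low half.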
5356   if (dst_bt_size > src_bt_size) {
5357     switch (dst_bt_size / src_bt_size) {
5358       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5359       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5360       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5361       default: ShouldNotReachHere();
5362     }
5363   } else {
5364     assert(dst_bt_size < src_bt_size, "");
5365     switch (src_bt_size / dst_bt_size) {
5366       case 2: {
5367         if (vlen_enc == AVX_128bit) {
5368           vpacksswb(dst, src, src, vlen_enc);
5369         } else {
5370           vpacksswb(dst, src, src, vlen_enc);
5371           vpermq(dst, dst, 0x08, vlen_enc);
5372         }
5373         break;
5374       }
5375       case 4: {
5376         if (vlen_enc == AVX_128bit) {
5377           vpackssdw(dst, src, src, vlen_enc);
5378           vpacksswb(dst, dst, dst, vlen_enc);
5379         } else {
5380           vpackssdw(dst, src, src, vlen_enc);
5381           vpermq(dst, dst, 0x08, vlen_enc);
5382           vpacksswb(dst, dst, dst, AVX_128bit);
5383         }
5384         break;
5385       }
5386       case 8: {
5387         if (vlen_enc == AVX_128bit) {
5388           vpshufd(dst, src, 0x08, vlen_enc);
5389           vpackssdw(dst, dst, dst, vlen_enc);
5390           vpacksswb(dst, dst, dst, vlen_enc);
5391         } else {
5392           vpshufd(dst, src, 0x08, vlen_enc);
5393           vpermq(dst, dst, 0x08, vlen_enc);
5394           vpackssdw(dst, dst, dst, AVX_128bit);
5395           vpacksswb(dst, dst, dst, AVX_128bit);
5396         }
5397         break;
5398       }
5399       default: ShouldNotReachHere();
5400     }
5401   }
5402 }
5403 
5404 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5405                                    bool merge, BasicType bt, int vlen_enc) {
5406   if (bt == T_INT) {
5407     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5408   } else {
5409     assert(bt == T_LONG, "");
5410     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5411   }
5412 }
5413 
5414 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5415                                    bool merge, BasicType bt, int vlen_enc) {
5416   if (bt == T_INT) {
5417     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5418   } else {
5419     assert(bt == T_LONG, "");
5420     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5421   }
5422 }
5423 
5424 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5425                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5426                                                int vec_enc) {
5427   int index = 0;
5428   int vindex = 0;
5429   mov64(rtmp1, 0x0101010101010101L);
5430   pdepq(rtmp1, src, rtmp1);
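       // The pdep above spreads the low eight mask bits of src across the byte lanes
       // of rtmp1: bit i of src becomes byte value 0x00 or 0x01 in byte i.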
5431   if (mask_len > 8) {
5432     movq(rtmp2, src);
5433     vpxor(xtmp, xtmp, xtmp, vec_enc);
5434     movq(xtmp, rtmp1);
5435   }
5436   movq(dst, rtmp1);
5437 
5438   mask_len -= 8;
5439   while (mask_len > 0) {
5440     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5441     index++;
5442     if ((index % 2) == 0) {
5443       pxor(xtmp, xtmp);
5444     }
5445     mov64(rtmp1, 0x0101010101010101L);
5446     shrq(rtmp2, 8);
5447     pdepq(rtmp1, rtmp2, rtmp1);
5448     pinsrq(xtmp, rtmp1, index % 2);
5449     vindex = index / 2;
5450     if (vindex) {
      // Write the entire 16 byte vector once both 64 bit
      // lanes are updated, to save redundant instructions.
5453       if (index % 2) {
5454         vinsertf128(dst, dst, xtmp, vindex);
5455       }
5456     } else {
5457       vmovdqu(dst, xtmp);
5458     }
5459     mask_len -= 8;
5460   }
5461 }
5462 
5463 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5464   switch(opc) {
5465     case Op_VectorMaskTrueCount:
5466       popcntq(dst, tmp);
5467       break;
5468     case Op_VectorMaskLastTrue:
5469       if (VM_Version::supports_lzcnt()) {
5470         lzcntq(tmp, tmp);
5471         movl(dst, 63);
5472         subl(dst, tmp);
5473       } else {
5474         movl(dst, -1);
5475         bsrq(tmp, tmp);
5476         cmov32(Assembler::notZero, dst, tmp);
5477       }
5478       break;
5479     case Op_VectorMaskFirstTrue:
5480       if (VM_Version::supports_bmi1()) {
5481         if (masklen < 32) {
5482           orl(tmp, 1 << masklen);
5483           tzcntl(dst, tmp);
5484         } else if (masklen == 32) {
5485           tzcntl(dst, tmp);
5486         } else {
5487           assert(masklen == 64, "");
5488           tzcntq(dst, tmp);
5489         }
5490       } else {
5491         if (masklen < 32) {
5492           orl(tmp, 1 << masklen);
5493           bsfl(dst, tmp);
5494         } else {
5495           assert(masklen == 32 || masklen == 64, "");
5496           movl(dst, masklen);
5497           if (masklen == 32)  {
5498             bsfl(tmp, tmp);
5499           } else {
5500             bsfq(tmp, tmp);
5501           }
5502           cmov32(Assembler::notZero, dst, tmp);
5503         }
5504       }
5505       break;
5506     case Op_VectorMaskToLong:
5507       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5508       break;
5509     default: assert(false, "Unhandled mask operation");
5510   }
5511 }
5512 
5513 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5514                                               int masklen, int masksize, int vec_enc) {
5515   assert(VM_Version::supports_popcnt(), "");
5516 
5517   if(VM_Version::supports_avx512bw()) {
5518     kmovql(tmp, mask);
5519   } else {
5520     assert(masklen <= 16, "");
5521     kmovwl(tmp, mask);
5522   }
5523 
  // A mask generated by partial vector comparison/replicate/mask manipulation
  // operations needs to be clipped.
5526   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5527     andq(tmp, (1 << masklen) - 1);
5528   }
5529 
5530   vector_mask_operation_helper(opc, dst, tmp, masklen);
5531 }
5532 
5533 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5534                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5535   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5536          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5537   assert(VM_Version::supports_popcnt(), "");
5538 
5539   bool need_clip = false;
5540   switch(bt) {
5541     case T_BOOLEAN:
      // While masks of other types contain lane values 0 or -1, boolean masks contain 0 or 1
5543       vpxor(xtmp, xtmp, xtmp, vec_enc);
5544       vpsubb(xtmp, xtmp, mask, vec_enc);
5545       vpmovmskb(tmp, xtmp, vec_enc);
5546       need_clip = masklen < 16;
5547       break;
5548     case T_BYTE:
5549       vpmovmskb(tmp, mask, vec_enc);
5550       need_clip = masklen < 16;
5551       break;
5552     case T_SHORT:
5553       vpacksswb(xtmp, mask, mask, vec_enc);
5554       if (masklen >= 16) {
5555         vpermpd(xtmp, xtmp, 8, vec_enc);
5556       }
5557       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5558       need_clip = masklen < 16;
5559       break;
5560     case T_INT:
5561     case T_FLOAT:
5562       vmovmskps(tmp, mask, vec_enc);
5563       need_clip = masklen < 4;
5564       break;
5565     case T_LONG:
5566     case T_DOUBLE:
5567       vmovmskpd(tmp, mask, vec_enc);
5568       need_clip = masklen < 2;
5569       break;
5570     default: assert(false, "Unhandled type, %s", type2name(bt));
5571   }
5572 
  // A mask generated by partial vector comparison/replicate/mask manipulation
  // operations needs to be clipped.
5575   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5576     // need_clip implies masklen < 32
5577     andq(tmp, (1 << masklen) - 1);
5578   }
5579 
5580   vector_mask_operation_helper(opc, dst, tmp, masklen);
5581 }
5582 
5583 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5584                                              Register rtmp2, int mask_len) {
5585   kmov(rtmp1, src);
5586   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5587   mov64(rtmp2, -1L);
5588   pextq(rtmp2, rtmp2, rtmp1);
5589   kmov(dst, rtmp2);
5590 }
5591 
5592 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5593                                                     XMMRegister mask, Register rtmp, Register rscratch,
5594                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5595                                                     int vec_enc) {
5596   assert(type2aelembytes(bt) >= 4, "");
5597   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5598   address compress_perm_table = nullptr;
5599   address expand_perm_table = nullptr;
5600   if (type2aelembytes(bt) == 8) {
5601     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5602     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5603     vmovmskpd(rtmp, mask, vec_enc);
5604   } else {
5605     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5606     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5607     vmovmskps(rtmp, mask, vec_enc);
5608   }
5609   shlq(rtmp, 5); // for 32 byte permute row.
5610   if (opcode == Op_CompressV) {
5611     lea(rscratch, ExternalAddress(compress_perm_table));
5612   } else {
5613     lea(rscratch, ExternalAddress(expand_perm_table));
5614   }
5615   addptr(rtmp, rscratch);
5616   vmovdqu(permv, Address(rtmp));
5617   vpermps(dst, permv, src, Assembler::AVX_256bit);
5618   vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with a zero vector using the permute mask: each entry in
  // a permute table row contains either a valid permute index or a -1 (default)
  // value, so the row can also serve as a blending mask after
  // compressing/expanding the source vector lanes.
5623   vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
5624 }
5625 
5626 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5627                                                bool merge, BasicType bt, int vec_enc) {
5628   if (opcode == Op_CompressV) {
5629     switch(bt) {
5630     case T_BYTE:
5631       evpcompressb(dst, mask, src, merge, vec_enc);
5632       break;
5633     case T_CHAR:
5634     case T_SHORT:
5635       evpcompressw(dst, mask, src, merge, vec_enc);
5636       break;
5637     case T_INT:
5638       evpcompressd(dst, mask, src, merge, vec_enc);
5639       break;
5640     case T_FLOAT:
5641       evcompressps(dst, mask, src, merge, vec_enc);
5642       break;
5643     case T_LONG:
5644       evpcompressq(dst, mask, src, merge, vec_enc);
5645       break;
5646     case T_DOUBLE:
5647       evcompresspd(dst, mask, src, merge, vec_enc);
5648       break;
5649     default:
5650       fatal("Unsupported type %s", type2name(bt));
5651       break;
5652     }
5653   } else {
5654     assert(opcode == Op_ExpandV, "");
5655     switch(bt) {
5656     case T_BYTE:
5657       evpexpandb(dst, mask, src, merge, vec_enc);
5658       break;
5659     case T_CHAR:
5660     case T_SHORT:
5661       evpexpandw(dst, mask, src, merge, vec_enc);
5662       break;
5663     case T_INT:
5664       evpexpandd(dst, mask, src, merge, vec_enc);
5665       break;
5666     case T_FLOAT:
5667       evexpandps(dst, mask, src, merge, vec_enc);
5668       break;
5669     case T_LONG:
5670       evpexpandq(dst, mask, src, merge, vec_enc);
5671       break;
5672     case T_DOUBLE:
5673       evexpandpd(dst, mask, src, merge, vec_enc);
5674       break;
5675     default:
5676       fatal("Unsupported type %s", type2name(bt));
5677       break;
5678     }
5679   }
5680 }
5681 
5682 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5683                                            KRegister ktmp1, int vec_enc) {
5684   if (opcode == Op_SignumVD) {
5685     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5687     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5688     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5689     // if src == NaN, -0.0 or 0.0 return src.
5690     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5691     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5692   } else {
5693     assert(opcode == Op_SignumVF, "");
5694     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5696     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5697     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5698     // if src == NaN, -0.0 or 0.0 return src.
5699     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5700     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5701   }
5702 }
5703 
5704 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5705                                           XMMRegister xtmp1, int vec_enc) {
5706   if (opcode == Op_SignumVD) {
5707     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5709     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
5710     // if src == NaN, -0.0 or 0.0 return src.
5711     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5712     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5713   } else {
5714     assert(opcode == Op_SignumVF, "");
5715     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5717     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
5718     // if src == NaN, -0.0 or 0.0 return src.
5719     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5720     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5721   }
5722 }
5723 
5724 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5725   if (VM_Version::supports_avx512bw()) {
5726     if (mask_len > 32) {
5727       kmovql(dst, src);
5728     } else {
5729       kmovdl(dst, src);
5730       if (mask_len != 32) {
5731         kshiftrdl(dst, dst, 32 - mask_len);
5732       }
5733     }
5734   } else {
5735     assert(mask_len <= 16, "");
5736     kmovwl(dst, src);
5737     if (mask_len != 16) {
5738       kshiftrwl(dst, dst, 16 - mask_len);
5739     }
5740   }
5741 }
5742 
5743 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5744   int lane_size = type2aelembytes(bt);
5745   if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5746       (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5747     movptr(rtmp, imm32);
5748     switch(lane_size) {
5749       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5750       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5751       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5752       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
5755     }
5756   } else {
5757     movptr(rtmp, imm32);
5758     movq(dst, rtmp);
5759     switch(lane_size) {
5760       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5761       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5762       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5763       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
5766     }
5767   }
5768 }
5769 
5770 //
// The following is a lookup table based popcount computation algorithm:
5772 //       Index   Bit set count
5773 //     [ 0000 ->   0,
5774 //       0001 ->   1,
5775 //       0010 ->   1,
5776 //       0011 ->   2,
5777 //       0100 ->   1,
5778 //       0101 ->   2,
5779 //       0110 ->   2,
5780 //       0111 ->   3,
5781 //       1000 ->   1,
5782 //       1001 ->   2,
5783 //       1010 ->   3,
5784 //       1011 ->   3,
5785 //       1100 ->   2,
5786 //       1101 ->   3,
5787 //       1111 ->   4 ]
//  a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as
//     shuffle indices for lookup table access.
//  b. Right shift each byte of the vector lane by 4 positions.
//  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
5792 //     shuffle indices for lookup table access.
5793 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5794 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5795 //     count of all the bytes of a quadword.
5796 //  f. Perform step e. for upper 128bit vector lane.
5797 //  g. Pack the bitset count of quadwords back to double word.
5798 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
5799 
5800 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5801                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5802   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5803   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5804   vpsrlw(dst, src, 4, vec_enc);
5805   vpand(dst, dst, xtmp1, vec_enc);
5806   vpand(xtmp1, src, xtmp1, vec_enc);
5807   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5808   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5809   vpshufb(dst, xtmp2, dst, vec_enc);
5810   vpaddb(dst, dst, xtmp1, vec_enc);
5811 }
5812 
5813 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5814                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5815   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // The following code implements steps e, f, g and h of the above algorithm.
5817   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5818   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5819   vpsadbw(dst, dst, xtmp2, vec_enc);
5820   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5821   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5822   vpackuswb(dst, xtmp1, dst, vec_enc);
5823 }
5824 
5825 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5826                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5827   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5828   // Add the popcount of upper and lower bytes of word.
5829   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5830   vpsrlw(dst, xtmp1, 8, vec_enc);
5831   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5832   vpaddw(dst, dst, xtmp1, vec_enc);
5833 }
5834 
5835 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5836                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5837   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5838   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5839   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5840 }
5841 
5842 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5843                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5844   switch(bt) {
5845     case T_LONG:
5846       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5847       break;
5848     case T_INT:
5849       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5850       break;
5851     case T_CHAR:
5852     case T_SHORT:
5853       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5854       break;
5855     case T_BYTE:
5856     case T_BOOLEAN:
5857       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5858       break;
5859     default:
5860       fatal("Unsupported type %s", type2name(bt));
5861       break;
5862   }
5863 }
5864 
5865 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5866                                                       KRegister mask, bool merge, int vec_enc) {
5867   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5868   switch(bt) {
5869     case T_LONG:
5870       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5871       evpopcntq(dst, mask, src, merge, vec_enc);
5872       break;
5873     case T_INT:
5874       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5875       evpopcntd(dst, mask, src, merge, vec_enc);
5876       break;
5877     case T_CHAR:
5878     case T_SHORT:
5879       assert(VM_Version::supports_avx512_bitalg(), "");
5880       evpopcntw(dst, mask, src, merge, vec_enc);
5881       break;
5882     case T_BYTE:
5883     case T_BOOLEAN:
5884       assert(VM_Version::supports_avx512_bitalg(), "");
5885       evpopcntb(dst, mask, src, merge, vec_enc);
5886       break;
5887     default:
5888       fatal("Unsupported type %s", type2name(bt));
5889       break;
5890   }
5891 }
5892 
// The bit reversal algorithm first reverses the bits of each byte, followed by
// a byte level reversal for multi-byte primitive types (short/int/long).
// The algorithm performs a lookup table access to get the reversed bit sequence
// corresponding to a 4 bit value. Thus the reversed bit sequence for a byte
// is obtained by swapping the reversed bit sequences of the upper and lower
// nibbles of the byte.
5899 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5900                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5901   if (VM_Version::supports_avx512vlbw()) {
5902 
5903     // Get the reverse bit sequence of lower nibble of each byte.
5904     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5905     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5906     evpandq(dst, xtmp2, src, vec_enc);
5907     vpshufb(dst, xtmp1, dst, vec_enc);
5908     vpsllq(dst, dst, 4, vec_enc);
5909 
5910     // Get the reverse bit sequence of upper nibble of each byte.
5911     vpandn(xtmp2, xtmp2, src, vec_enc);
5912     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5913     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5914 
    // Perform a logical OR between the left shifted reversed bit sequence of the lower nibble and
    // the right shifted reversed bit sequence of the upper nibble to obtain the reversed bit sequence of each byte.
5917     evporq(xtmp2, dst, xtmp2, vec_enc);
5918     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5919 
5920   } else if(vec_enc == Assembler::AVX_512bit) {
5921     // Shift based bit reversal.
5922     assert(bt == T_LONG || bt == T_INT, "");
5923 
5924     // Swap lower and upper nibble of each byte.
5925     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5926 
5927     // Swap two least and most significant bits of each nibble.
5928     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5929 
5930     // Swap adjacent pair of bits.
5931     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5932     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5933 
5934     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5935     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5936   } else {
5937     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5938     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5939 
5940     // Get the reverse bit sequence of lower nibble of each byte.
5941     vpand(dst, xtmp2, src, vec_enc);
5942     vpshufb(dst, xtmp1, dst, vec_enc);
5943     vpsllq(dst, dst, 4, vec_enc);
5944 
5945     // Get the reverse bit sequence of upper nibble of each byte.
5946     vpandn(xtmp2, xtmp2, src, vec_enc);
5947     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5948     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5949 
    // Perform a logical OR between the left shifted reversed bit sequence of the lower nibble and
    // the right shifted reversed bit sequence of the upper nibble to obtain the reversed bit sequence of each byte.
5952     vpor(xtmp2, dst, xtmp2, vec_enc);
5953     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5954   }
5955 }
5956 
5957 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5958                                                 XMMRegister xtmp, Register rscratch) {
5959   assert(VM_Version::supports_gfni(), "");
5960   assert(rscratch != noreg || always_reachable(mask), "missing");
5961 
5962   // Galois field instruction based bit reversal based on following algorithm.
5963   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5964   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5965   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5966   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5967 }
5968 
5969 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5970                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
5971   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5972   evpandq(dst, xtmp1, src, vec_enc);
5973   vpsllq(dst, dst, nbits, vec_enc);
5974   vpandn(xtmp1, xtmp1, src, vec_enc);
5975   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5976   evporq(dst, dst, xtmp1, vec_enc);
5977 }
5978 
5979 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5980                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5981   // Shift based bit reversal.
5982   assert(VM_Version::supports_evex(), "");
5983   switch(bt) {
5984     case T_LONG:
5985       // Swap upper and lower double word of each quad word.
5986       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5987       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5988       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5989       break;
5990     case T_INT:
5991       // Swap upper and lower word of each double word.
5992       evprord(xtmp1, k0, src, 16, true, vec_enc);
5993       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5994       break;
5995     case T_CHAR:
5996     case T_SHORT:
5997       // Swap upper and lower byte of each word.
5998       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5999       break;
6000     case T_BYTE:
6001       evmovdquq(dst, k0, src, true, vec_enc);
6002       break;
6003     default:
6004       fatal("Unsupported type %s", type2name(bt));
6005       break;
6006   }
6007 }
6008 
6009 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
6010   if (bt == T_BYTE) {
6011     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
6012       evmovdquq(dst, k0, src, true, vec_enc);
6013     } else {
6014       vmovdqu(dst, src);
6015     }
6016     return;
6017   }
6018   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
6019   // pre-computed shuffle indices.
6020   switch(bt) {
6021     case T_LONG:
6022       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
6023       break;
6024     case T_INT:
6025       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
6026       break;
6027     case T_CHAR:
6028     case T_SHORT:
6029       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6030       break;
6031     default:
6032       fatal("Unsupported type %s", type2name(bt));
6033       break;
6034   }
6035   vpshufb(dst, src, dst, vec_enc);
6036 }
6037 
6038 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6039                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6040                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6041   assert(is_integral_type(bt), "");
6042   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6043   assert(VM_Version::supports_avx512cd(), "");
6044   switch(bt) {
6045     case T_LONG:
6046       evplzcntq(dst, ktmp, src, merge, vec_enc);
6047       break;
6048     case T_INT:
6049       evplzcntd(dst, ktmp, src, merge, vec_enc);
6050       break;
6051     case T_SHORT:
6052       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6053       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6054       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6055       vpunpckhwd(dst, xtmp1, src, vec_enc);
6056       evplzcntd(dst, ktmp, dst, merge, vec_enc);
6057       vpackusdw(dst, xtmp2, dst, vec_enc);
6058       break;
6059     case T_BYTE:
6060       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6061       // accessing the lookup table.
6062       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6063       // accessing the lookup table.
6064       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6065       assert(VM_Version::supports_avx512bw(), "");
6066       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6067       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6068       vpand(xtmp2, dst, src, vec_enc);
6069       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6070       vpsrlw(xtmp3, src, 4, vec_enc);
6071       vpand(xtmp3, dst, xtmp3, vec_enc);
6072       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6073       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6074       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6075       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6076       break;
6077     default:
6078       fatal("Unsupported type %s", type2name(bt));
6079       break;
6080   }
6081 }
6082 
6083 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6084                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6085   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6086   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6087   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6088   // accessing the lookup table.
6089   vpand(dst, xtmp2, src, vec_enc);
6090   vpshufb(dst, xtmp1, dst, vec_enc);
6091   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6092   // accessing the lookup table.
6093   vpsrlw(xtmp3, src, 4, vec_enc);
6094   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6095   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6096   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6097   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6098   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6099   vpaddb(dst, dst, xtmp2, vec_enc);
6100   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6101 }
6102 
6103 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6104                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6105   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6106   // Add zero counts of lower byte and upper byte of a word if
6107   // upper byte holds a zero value.
6108   vpsrlw(xtmp3, src, 8, vec_enc);
6109   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6110   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6111   vpsllw(xtmp2, dst, 8, vec_enc);
6112   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6113   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6114   vpsrlw(dst, dst, 8, vec_enc);
6115 }
6116 
6117 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6118                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in 1.x normalized form,
  // the biased exponent can be used to compute the leading zero count as per the
  // following formula:
  // LZCNT = 31 - (biased_exp - 127)
  // Special handling is needed for zero, max_int and negative source values.
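  //
  // Worked example (for reference): src = 0x00010000 converts to 65536.0f, whose
  // biased exponent is 143, giving LZCNT = 31 - (143 - 127) = 15.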
6124 
6125   // Broadcast 0xFF
6126   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6127   vpsrld(xtmp1, xtmp1, 24, vec_enc);
6128 
6129   // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
6130   // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
6131   // contributes to the leading number of zeros.
6132   vpsrld(xtmp2, src, 1, vec_enc);
6133   vpandn(xtmp3, xtmp2, src, vec_enc);
6134 
6135   // Extract biased exponent.
6136   vcvtdq2ps(dst, xtmp3, vec_enc);
6137   vpsrld(dst, dst, 23, vec_enc);
6138   vpand(dst, dst, xtmp1, vec_enc);
6139 
6140   // Broadcast 127.
6141   vpsrld(xtmp1, xtmp1, 1, vec_enc);
6142   // Exponent = biased_exp - 127
6143   vpsubd(dst, dst, xtmp1, vec_enc);
6144 
6145   // Exponent_plus_one = Exponent + 1
6146   vpsrld(xtmp3, xtmp1, 6, vec_enc);
6147   vpaddd(dst, dst, xtmp3, vec_enc);
6148 
  // Replace a negative exponent with zero; the exponent is negative when the src
  // lane contains a zero value.
6151   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6152   vblendvps(dst, dst, xtmp2, dst, vec_enc);
6153 
6154   // Rematerialize broadcast 32.
6155   vpslld(xtmp1, xtmp3, 5, vec_enc);
6156   // Exponent is 32 if corresponding source lane contains max_int value.
6157   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6158   // LZCNT = 32 - exponent_plus_one
6159   vpsubd(dst, xtmp1, dst, vec_enc);
6160 
6161   // Replace LZCNT with a value 1 if corresponding source lane
6162   // contains max_int value.
6163   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6164 
6165   // Replace biased_exp with 0 if source lane value is less than zero.
6166   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6167   vblendvps(dst, dst, xtmp2, src, vec_enc);
6168 }
6169 
6170 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6171                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6172   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6173   // Add zero counts of lower word and upper word of a double word if
6174   // upper word holds a zero value.
6175   vpsrld(xtmp3, src, 16, vec_enc);
6176   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6177   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6178   vpslld(xtmp2, dst, 16, vec_enc);
6179   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6180   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6181   vpsrld(dst, dst, 16, vec_enc);
6182   // Add zero counts of lower doubleword and upper doubleword of a
6183   // quadword if upper doubleword holds a zero value.
6184   vpsrlq(xtmp3, src, 32, vec_enc);
6185   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6186   vpsllq(xtmp2, dst, 32, vec_enc);
6187   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6188   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6189   vpsrlq(dst, dst, 32, vec_enc);
6190 }
6191 
6192 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6193                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6194                                                        Register rtmp, int vec_enc) {
6195   assert(is_integral_type(bt), "unexpected type");
6196   assert(vec_enc < Assembler::AVX_512bit, "");
6197   switch(bt) {
6198     case T_LONG:
6199       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6200       break;
6201     case T_INT:
6202       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6203       break;
6204     case T_SHORT:
6205       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6206       break;
6207     case T_BYTE:
6208       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6209       break;
6210     default:
6211       fatal("Unsupported type %s", type2name(bt));
6212       break;
6213   }
6214 }
6215 
6216 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6217   switch(bt) {
6218     case T_BYTE:
6219       vpsubb(dst, src1, src2, vec_enc);
6220       break;
6221     case T_SHORT:
6222       vpsubw(dst, src1, src2, vec_enc);
6223       break;
6224     case T_INT:
6225       vpsubd(dst, src1, src2, vec_enc);
6226       break;
6227     case T_LONG:
6228       vpsubq(dst, src1, src2, vec_enc);
6229       break;
6230     default:
6231       fatal("Unsupported type %s", type2name(bt));
6232       break;
6233   }
6234 }
6235 
// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
6240 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6241                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6242                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6243   assert(is_integral_type(bt), "");
6244   // xtmp = -1
6245   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6246   // xtmp = xtmp + src
6247   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6248   // xtmp = xtmp & ~src
6249   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6250   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6251   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6252   vpsub(bt, dst, xtmp4, dst, vec_enc);
6253 }
6254 
// Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation:
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
6257 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6258                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6259   assert(is_integral_type(bt), "");
6260   // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6262   // xtmp = 0 - src
6263   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6264   // xtmp = xtmp | src
6265   vpor(xtmp3, xtmp3, src, vec_enc);
6266   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6267   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6268   vpsub(bt, dst, xtmp1, dst, vec_enc);
6269 }
6270 
6271 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6272   Label done;
6273   Label neg_divisor_fastpath;
6274   cmpl(divisor, 0);
6275   jccb(Assembler::less, neg_divisor_fastpath);
6276   xorl(rdx, rdx);
6277   divl(divisor);
6278   jmpb(done);
6279   bind(neg_divisor_fastpath);
6280   // Fastpath for divisor < 0:
6281   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6282   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6283   movl(rdx, rax);
6284   subl(rdx, divisor);
6285   if (VM_Version::supports_bmi1()) {
6286     andnl(rax, rdx, rax);
6287   } else {
6288     notl(rdx);
6289     andl(rax, rdx);
6290   }
6291   shrl(rax, 31);
6292   bind(done);
6293 }
6294 
6295 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6296   Label done;
6297   Label neg_divisor_fastpath;
6298   cmpl(divisor, 0);
6299   jccb(Assembler::less, neg_divisor_fastpath);
6300   xorl(rdx, rdx);
6301   divl(divisor);
6302   jmpb(done);
6303   bind(neg_divisor_fastpath);
6304   // Fastpath when divisor < 0:
6305   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6306   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6307   movl(rdx, rax);
6308   subl(rax, divisor);
6309   if (VM_Version::supports_bmi1()) {
6310     andnl(rax, rax, rdx);
6311   } else {
6312     notl(rax);
6313     andl(rax, rdx);
6314   }
6315   sarl(rax, 31);
6316   andl(rax, divisor);
6317   subl(rdx, rax);
6318   bind(done);
6319 }
6320 
6321 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6322   Label done;
6323   Label neg_divisor_fastpath;
6324 
6325   cmpl(divisor, 0);
6326   jccb(Assembler::less, neg_divisor_fastpath);
6327   xorl(rdx, rdx);
6328   divl(divisor);
6329   jmpb(done);
6330   bind(neg_divisor_fastpath);
6331   // Fastpath for divisor < 0:
6332   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6333   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6334   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6335   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6336   movl(rdx, rax);
6337   subl(rax, divisor);
6338   if (VM_Version::supports_bmi1()) {
6339     andnl(rax, rax, rdx);
6340   } else {
6341     notl(rax);
6342     andl(rax, rdx);
6343   }
6344   movl(tmp, rax);
6345   shrl(rax, 31); // quotient
6346   sarl(tmp, 31);
6347   andl(tmp, divisor);
6348   subl(rdx, tmp); // remainder
6349   bind(done);
6350 }
6351 
6352 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6353                                  XMMRegister xtmp2, Register rtmp) {
6354   if(VM_Version::supports_gfni()) {
6355     // Galois field instruction based bit reversal based on following algorithm.
6356     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6357     mov64(rtmp, 0x8040201008040201L);
6358     movq(xtmp1, src);
6359     movq(xtmp2, rtmp);
6360     gf2p8affineqb(xtmp1, xtmp2, 0);
6361     movq(dst, xtmp1);
6362   } else {
6363     // Swap even and odd numbered bits.
6364     movl(rtmp, src);
6365     andl(rtmp, 0x55555555);
6366     shll(rtmp, 1);
6367     movl(dst, src);
6368     andl(dst, 0xAAAAAAAA);
6369     shrl(dst, 1);
6370     orl(dst, rtmp);
6371 
6372     // Swap LSB and MSB 2 bits of each nibble.
6373     movl(rtmp, dst);
6374     andl(rtmp, 0x33333333);
6375     shll(rtmp, 2);
6376     andl(dst, 0xCCCCCCCC);
6377     shrl(dst, 2);
6378     orl(dst, rtmp);
6379 
6380     // Swap LSB and MSB 4 bits of each byte.
6381     movl(rtmp, dst);
6382     andl(rtmp, 0x0F0F0F0F);
6383     shll(rtmp, 4);
6384     andl(dst, 0xF0F0F0F0);
6385     shrl(dst, 4);
6386     orl(dst, rtmp);
6387   }
6388   bswapl(dst);
6389 }
6390 
6391 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6392                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
6393   if(VM_Version::supports_gfni()) {
6394     // Galois field instruction based bit reversal based on following algorithm.
6395     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6396     mov64(rtmp1, 0x8040201008040201L);
6397     movq(xtmp1, src);
6398     movq(xtmp2, rtmp1);
6399     gf2p8affineqb(xtmp1, xtmp2, 0);
6400     movq(dst, xtmp1);
6401   } else {
6402     // Swap even and odd numbered bits.
6403     movq(rtmp1, src);
6404     mov64(rtmp2, 0x5555555555555555L);
6405     andq(rtmp1, rtmp2);
6406     shlq(rtmp1, 1);
6407     movq(dst, src);
6408     notq(rtmp2);
6409     andq(dst, rtmp2);
6410     shrq(dst, 1);
6411     orq(dst, rtmp1);
6412 
6413     // Swap LSB and MSB 2 bits of each nibble.
6414     movq(rtmp1, dst);
6415     mov64(rtmp2, 0x3333333333333333L);
6416     andq(rtmp1, rtmp2);
6417     shlq(rtmp1, 2);
6418     notq(rtmp2);
6419     andq(dst, rtmp2);
6420     shrq(dst, 2);
6421     orq(dst, rtmp1);
6422 
6423     // Swap LSB and MSB 4 bits of each byte.
6424     movq(rtmp1, dst);
6425     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6426     andq(rtmp1, rtmp2);
6427     shlq(rtmp1, 4);
6428     notq(rtmp2);
6429     andq(dst, rtmp2);
6430     shrq(dst, 4);
6431     orq(dst, rtmp1);
6432   }
6433   bswapq(dst);
6434 }
6435 
6436 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6437   Label done;
6438   Label neg_divisor_fastpath;
6439   cmpq(divisor, 0);
6440   jccb(Assembler::less, neg_divisor_fastpath);
6441   xorl(rdx, rdx);
6442   divq(divisor);
6443   jmpb(done);
6444   bind(neg_divisor_fastpath);
6445   // Fastpath for divisor < 0:
6446   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6447   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6448   movq(rdx, rax);
6449   subq(rdx, divisor);
6450   if (VM_Version::supports_bmi1()) {
6451     andnq(rax, rdx, rax);
6452   } else {
6453     notq(rdx);
6454     andq(rax, rdx);
6455   }
6456   shrq(rax, 63);
6457   bind(done);
6458 }
6459 
6460 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6461   Label done;
6462   Label neg_divisor_fastpath;
6463   cmpq(divisor, 0);
6464   jccb(Assembler::less, neg_divisor_fastpath);
6465   xorq(rdx, rdx);
6466   divq(divisor);
6467   jmp(done);
6468   bind(neg_divisor_fastpath);
6469   // Fastpath when divisor < 0:
6470   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6471   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6472   movq(rdx, rax);
6473   subq(rax, divisor);
6474   if (VM_Version::supports_bmi1()) {
6475     andnq(rax, rax, rdx);
6476   } else {
6477     notq(rax);
6478     andq(rax, rdx);
6479   }
6480   sarq(rax, 63);
6481   andq(rax, divisor);
6482   subq(rdx, rax);
6483   bind(done);
6484 }
6485 
6486 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6487   Label done;
6488   Label neg_divisor_fastpath;
6489   cmpq(divisor, 0);
6490   jccb(Assembler::less, neg_divisor_fastpath);
6491   xorq(rdx, rdx);
6492   divq(divisor);
6493   jmp(done);
6494   bind(neg_divisor_fastpath);
6495   // Fastpath for divisor < 0:
6496   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6497   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6498   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6499   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6500   movq(rdx, rax);
6501   subq(rax, divisor);
6502   if (VM_Version::supports_bmi1()) {
6503     andnq(rax, rax, rdx);
6504   } else {
6505     notq(rax);
6506     andq(rax, rdx);
6507   }
6508   movq(tmp, rax);
6509   shrq(rax, 63); // quotient
6510   sarq(tmp, 63);
6511   andq(tmp, divisor);
6512   subq(rdx, tmp); // remainder
6513   bind(done);
6514 }
6515 
6516 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6517                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6518                                         int vlen_enc) {
6519   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using the
  // lower 4 bits of each shuffle lane, thus all shuffle indices are
  // normalized to the index range 0-15. This makes sure that indices differing
  // by a multiple of 16 map to the same relative position within a 128 bit
  // lane, i.e. elements corresponding to shuffle indices 16, 32 and 48
  // select the same in-lane position as index 0 in their respective 128 bit lanes.
6526   movl(rtmp, 16);
6527   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6528 
  // Compute a mask for the shuffle vector by comparing indices against the expression INDEX < 16,
  // broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using the
  // original shuffle indices, and move the shuffled lanes corresponding to the true
  // mask into the destination vector.
6533   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6534   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6535   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6536 
6537   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6538   // and broadcasting second 128 bit lane.
6539   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6540   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6541   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6542   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6543   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6544 
6545   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6546   // and broadcasting third 128 bit lane.
6547   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6548   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6549   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6550   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6551   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6552 
6553   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
6555   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6556   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6557   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6558   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6559   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6560 }
6561 
6562 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6563                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6564   if (vlen_enc == AVX_128bit) {
6565     vpermilps(dst, src, shuffle, vlen_enc);
6566   } else if (bt == T_INT) {
6567     vpermd(dst, shuffle, src, vlen_enc);
6568   } else {
6569     assert(bt == T_FLOAT, "");
6570     vpermps(dst, shuffle, src, vlen_enc);
6571   }
6572 }
6573 
6574 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6575   switch(opcode) {
6576     case Op_AddHF: vaddsh(dst, src1, src2); break;
6577     case Op_SubHF: vsubsh(dst, src1, src2); break;
6578     case Op_MulHF: vmulsh(dst, src1, src2); break;
6579     case Op_DivHF: vdivsh(dst, src1, src2); break;
6580     default: assert(false, "%s", NodeClassNames[opcode]); break;
6581   }
6582 }
6583 
6584 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6585   switch(elem_bt) {
6586     case T_BYTE:
6587       if (ideal_opc == Op_SaturatingAddV) {
6588         vpaddsb(dst, src1, src2, vlen_enc);
6589       } else {
6590         assert(ideal_opc == Op_SaturatingSubV, "");
6591         vpsubsb(dst, src1, src2, vlen_enc);
6592       }
6593       break;
6594     case T_SHORT:
6595       if (ideal_opc == Op_SaturatingAddV) {
6596         vpaddsw(dst, src1, src2, vlen_enc);
6597       } else {
6598         assert(ideal_opc == Op_SaturatingSubV, "");
6599         vpsubsw(dst, src1, src2, vlen_enc);
6600       }
6601       break;
6602     default:
6603       fatal("Unsupported type %s", type2name(elem_bt));
6604       break;
6605   }
6606 }
6607 
6608 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6609   switch(elem_bt) {
6610     case T_BYTE:
6611       if (ideal_opc == Op_SaturatingAddV) {
6612         vpaddusb(dst, src1, src2, vlen_enc);
6613       } else {
6614         assert(ideal_opc == Op_SaturatingSubV, "");
6615         vpsubusb(dst, src1, src2, vlen_enc);
6616       }
6617       break;
6618     case T_SHORT:
6619       if (ideal_opc == Op_SaturatingAddV) {
6620         vpaddusw(dst, src1, src2, vlen_enc);
6621       } else {
6622         assert(ideal_opc == Op_SaturatingSubV, "");
6623         vpsubusw(dst, src1, src2, vlen_enc);
6624       }
6625       break;
6626     default:
6627       fatal("Unsupported type %s", type2name(elem_bt));
6628       break;
6629   }
6630 }
6631 
6632 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6633                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when the second input is greater than the first input.
6635   // overflow_mask = Inp1 <u Inp2
6636   evpcmpu(elem_bt, ktmp,  src2, src1, Assembler::lt, vlen_enc);
6637   // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6638   evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6639 }
6640 
6641 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6642                                                               XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6643   // Emulate unsigned comparison using signed comparison
  // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE <s Inp2 + MIN_VALUE
6645   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6646   vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6647   vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6648 
6649   vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6650 
6651   // Res = INP1 - INP2 (non-commutative and non-associative)
6652   vpsub(elem_bt, dst, src1, src2, vlen_enc);
6653   // Res = Mask ? Zero : Res
6654   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6655   vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6656 }
6657 
6658 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6659                                                                XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned value ranges comprise only non-negative numbers, thus only upper bound saturation exists.
6661   // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6662   // Res = Signed Add INP1, INP2
6663   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6664   // T1 = SRC1 | SRC2
6665   vpor(xtmp1, src1, src2, vlen_enc);
6666   // Max_Unsigned = -1
6667   vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6668   // Unsigned compare:  Mask = Res <u T1
6669   evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6670   // res  = Mask ? Max_Unsigned : Res
6671   evpblend(elem_bt, dst, ktmp,  dst, xtmp2, true, vlen_enc);
6672 }
6673 
6674 //
6675 // Section 2-13 Hacker's Delight list following overflow detection check for saturating
6676 // unsigned addition operation.
6677 //    overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
6678 //
6679 // We empirically determined its semantic equivalence to following reduced expression
6680 //    overflow_mask =  (a + b) <u (a | b)
6681 //
6682 // and also verified it though Alive2 solver.
6683 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6684 //
6685 
6686 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6687                                                               XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6688   // Res = Signed Add INP1, INP2
6689   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6690   // Compute T1 = INP1 | INP2
6691   vpor(xtmp3, src1, src2, vlen_enc);
6692   // T1 = Minimum signed value.
6693   vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6694   // Convert T1 to signed value, T1 = T1 + MIN_VALUE
6695   vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
6696   // Convert Res to signed value, Res<s> = Res + MIN_VALUE
6697   vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
6698   // Compute overflow detection mask = Res<1> <s T1
6699   if (elem_bt == T_INT) {
6700     vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
6701   } else {
6702     assert(elem_bt == T_LONG, "");
6703     vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
6704   }
6705   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6706 }
6707 
6708 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6709                                       int vlen_enc, bool xtmp2_hold_M1) {
6710   if (VM_Version::supports_avx512dq()) {
6711     evpmovq2m(ktmp, src, vlen_enc);
6712   } else {
6713     assert(VM_Version::supports_evex(), "");
6714     if (!xtmp2_hold_M1) {
6715       vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6716     }
6717     evpsraq(xtmp1, src, 63, vlen_enc);
6718     evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6719   }
6720 }
6721 
6722 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6723                                       int vlen_enc, bool xtmp2_hold_M1) {
6724   if (VM_Version::supports_avx512dq()) {
6725     evpmovd2m(ktmp, src, vlen_enc);
6726   } else {
6727     assert(VM_Version::supports_evex(), "");
6728     if (!xtmp2_hold_M1) {
6729       vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6730     }
6731     vpsrad(xtmp1, src, 31, vlen_enc);
6732     Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6733   }
6734 }
6735 
6736 
6737 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6738   if (elem_bt == T_LONG) {
6739     if (VM_Version::supports_evex()) {
6740       evpsraq(dst, src, 63, vlen_enc);
6741     } else {
6742       vpsrad(dst, src, 31, vlen_enc);
6743       vpshufd(dst, dst, 0xF5, vlen_enc);
6744     }
6745   } else {
6746     assert(elem_bt == T_INT, "");
6747     vpsrad(dst, src, 31, vlen_enc);
6748   }
6749 }
6750 
6751 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6752   if (compute_allones) {
6753     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6754       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6755     } else {
6756       vpcmpeqq(allones, allones, allones, vlen_enc);
6757     }
6758   }
6759   if (elem_bt == T_LONG) {
6760     vpsrlq(dst, allones, 1, vlen_enc);
6761   } else {
6762     assert(elem_bt == T_INT, "");
6763     vpsrld(dst, allones, 1, vlen_enc);
6764   }
6765 }
6766 
6767 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6768   if (compute_allones) {
6769     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6770       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6771     } else {
6772       vpcmpeqq(allones, allones, allones, vlen_enc);
6773     }
6774   }
6775   if (elem_bt == T_LONG) {
6776     vpsllq(dst, allones, 63, vlen_enc);
6777   } else {
6778     assert(elem_bt == T_INT, "");
6779     vpslld(dst, allones, 31, vlen_enc);
6780   }
6781 }
6782 
6783 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
6784                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6785   switch(elem_bt) {
6786     case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6787     case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6788     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6789     case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6790     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6791   }
6792 }
6793 
6794 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6795   switch(elem_bt) {
6796     case  T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
6797     case  T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
6798     case  T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6799     case  T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
6800     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6801   }
6802 }
6803 
6804 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6805                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6806   if (elem_bt == T_LONG) {
6807     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6808   } else {
6809     assert(elem_bt == T_INT, "");
6810     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6811   }
6812 }
6813 
6814 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6815                                                          XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6816                                                          KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6817   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6818   // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6819   // Overflow detection is based on Hacker's Delight, section 2-13.
6820   if (ideal_opc == Op_SaturatingAddV) {
6821     // res = src1 + src2
6822     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6823     // Overflow occurs if both inputs have the same sign and the result's sign differs from it.
6824     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
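         // e.g. (T_INT, illustrative values): src1 = 0x7FFFFFFF, src2 = 1 gives res = 0x80000000;
         // (res ^ src1) = 0xFFFFFFFF and (res ^ src2) = 0x80000001, so their AND has the sign bit
         // set and the lane is flagged as overflowed.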
6825     vpxor(xtmp1, dst, src1, vlen_enc);
6826     vpxor(xtmp2, dst, src2, vlen_enc);
6827     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6828   } else {
6829     assert(ideal_opc == Op_SaturatingSubV, "");
6830     // res = src1 - src2
6831     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6832     // Overflow occurs when the inputs have opposite signs and
6833     // the result's sign differs from the sign of the first input.
6834     // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
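         // e.g. (T_INT, illustrative values): src1 = 0x80000000 (MIN_VALUE), src2 = 1 gives
         // res = 0x7FFFFFFF; (src1 ^ src2) = 0x80000001 and (res ^ src1) = 0xFFFFFFFF, so their
         // AND has the sign bit set and the lane is flagged as overflowed.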
6835     vpxor(xtmp1, src1, src2, vlen_enc);
6836     vpxor(xtmp2, dst, src1, vlen_enc);
6837     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6838   }
6839 
6840   // Compute overflow detection mask.
6841   evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
6842   // Note: xtmp1 holds -1 in all its lanes after the above call.
6843 
6844   // Compute mask based on first input polarity.
6845   evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
6846 
6847   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6848   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6849 
6850   // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
6851   // set bits in the first-input polarity mask hold the MIN value.
6852   evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6853   // Blend destination lanes with saturated values using overflow detection mask.
6854   evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6855 }
6856 
6857 
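     // AVX (non-EVEX) flavour of the routine above: opmask registers are not available, so the
     // overflow and first-input polarity masks are kept as vectors of 0/-1 lanes (see
     // vpsign_extend_dq) and the final selection is done with vpblendvb, which is safe here
     // because every byte of a mask lane carries the same sign-extended value.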
6858 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6859                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6860                                                         XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
6861   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6862   // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6863   // Overflow detection is based on Hacker's Delight, section 2-13.
6864   if (ideal_opc == Op_SaturatingAddV) {
6865     // res = src1 + src2
6866     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6867     // Overflow occurs if both inputs have the same sign and the result's sign differs from it.
6868     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6869     vpxor(xtmp1, dst, src1, vlen_enc);
6870     vpxor(xtmp2, dst, src2, vlen_enc);
6871     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6872   } else {
6873     assert(ideal_opc == Op_SaturatingSubV, "");
6874     // res = src1 - src2
6875     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6876     // Overflow occurs when the inputs have opposite signs and
6877     // the result's sign differs from the sign of the first input.
6878     // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
6879     vpxor(xtmp1, src1, src2, vlen_enc);
6880     vpxor(xtmp2, dst, src1, vlen_enc);
6881     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6882   }
6883 
6884   // Sign-extend to compute overflow detection mask.
6885   vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
6886 
6887   vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
6888   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
6889   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6890 
6891   // Compose saturating min/max vector using first input polarity mask.
6892   vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
6893   vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
6894 
6895   // Blend result with saturating vector using overflow detection mask.
6896   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6897 }
6898 
6899 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6900   switch(elem_bt) {
6901     case T_BYTE:
6902       if (ideal_opc == Op_SaturatingAddV) {
6903         vpaddsb(dst, src1, src2, vlen_enc);
6904       } else {
6905         assert(ideal_opc == Op_SaturatingSubV, "");
6906         vpsubsb(dst, src1, src2, vlen_enc);
6907       }
6908       break;
6909     case T_SHORT:
6910       if (ideal_opc == Op_SaturatingAddV) {
6911         vpaddsw(dst, src1, src2, vlen_enc);
6912       } else {
6913         assert(ideal_opc == Op_SaturatingSubV, "");
6914         vpsubsw(dst, src1, src2, vlen_enc);
6915       }
6916       break;
6917     default:
6918       fatal("Unsupported type %s", type2name(elem_bt));
6919       break;
6920   }
6921 }
6922 
6923 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6924   switch(elem_bt) {
6925     case T_BYTE:
6926       if (ideal_opc == Op_SaturatingAddV) {
6927         vpaddusb(dst, src1, src2, vlen_enc);
6928       } else {
6929         assert(ideal_opc == Op_SaturatingSubV, "");
6930         vpsubusb(dst, src1, src2, vlen_enc);
6931       }
6932       break;
6933     case T_SHORT:
6934       if (ideal_opc == Op_SaturatingAddV) {
6935         vpaddusw(dst, src1, src2, vlen_enc);
6936       } else {
6937         assert(ideal_opc == Op_SaturatingSubV, "");
6938         vpsubusw(dst, src1, src2, vlen_enc);
6939       }
6940       break;
6941     default:
6942       fatal("Unsupported type %s", type2name(elem_bt));
6943       break;
6944   }
6945 }
6946 
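     // Two-table permute (VPERMI2* semantics): on entry dst holds the per-lane selection indices
     // and each result lane is taken from the concatenation of src1 and src2 at that index.
     // Illustrative scalar sketch (not emitted code), assuming N lanes per source:
     //   idx = dst[i] & (2 * N - 1);
     //   dst[i] = (idx < N) ? src1[idx] : src2[idx - N];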
6947 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6948                                                      XMMRegister src2, int vlen_enc) {
6949   switch(elem_bt) {
6950     case T_BYTE:
6951       evpermi2b(dst, src1, src2, vlen_enc);
6952       break;
6953     case T_SHORT:
6954       evpermi2w(dst, src1, src2, vlen_enc);
6955       break;
6956     case T_INT:
6957       evpermi2d(dst, src1, src2, vlen_enc);
6958       break;
6959     case T_LONG:
6960       evpermi2q(dst, src1, src2, vlen_enc);
6961       break;
6962     case T_FLOAT:
6963       evpermi2ps(dst, src1, src2, vlen_enc);
6964       break;
6965     case T_DOUBLE:
6966       evpermi2pd(dst, src1, src2, vlen_enc);
6967       break;
6968     default:
6969       fatal("Unsupported type %s", type2name(elem_bt));
6970       break;
6971   }
6972 }
6973 
6974 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
6975   if (is_unsigned) {
6976     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6977   } else {
6978     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6979   }
6980 }
6981 
6982 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
6983   if (is_unsigned) {
6984     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6985   } else {
6986     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6987   }
6988 }
6989 
6990 void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6991                                           KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6992   if (opcode == Op_MaxHF) {
6993     // Move sign bits of src2 to mask register.
6994     evpmovw2m(ktmp, src2, vlen_enc);
6995     // xtmp1 = src2 < 0 ? src2 : src1
6996     evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
6997     // xtmp2 = src2 < 0 ? src1 : src2
6998     evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
6999     // The idea behind the above swapping is to make the second source operand a +ve value.
7000     // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
7001     // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
7002     // the second source operand, either a NaN or a valid floating-point value, is returned.
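         // e.g. (illustrative values): src1 = +0.0, src2 = -0.0 gives xtmp1 = -0.0, xtmp2 = +0.0 and
         // vmaxsh returns +0.0, as required; src1 = NaN, src2 = 1.0 gives dst = 1.0 here, and the
         // NaN fix-up below restores dst = NaN from xtmp1.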
7003     // dst = max(xtmp1, xtmp2)
7004     vmaxsh(dst, xtmp1, xtmp2);
7005     // isNaN = is_unordered_quiet(xtmp1)
7006     evcmpsh(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q);
7007     // The final result is the same as the first source if it is a NaN;
7008     // if the second operand holds a NaN then, as per the above semantics,
7009     // the result is already the same as the second operand.
7010     Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7011   } else {
7012     assert(opcode == Op_MinHF, "");
7013     // Move sign bits of src1 to mask register.
7014     evpmovw2m(ktmp, src1, vlen_enc);
7015     // xtmp1 = src1 < 0 ? src2 : src1
7016     evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7017     // xtmp2 = src1 < 0 ? src1 : src2
7018     evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
7019     // The idea behind the above swapping is to make the second source operand a -ve value.
7020     // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
7021     // the second source operand is returned.
7022     // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
7023     // or a valid floating-point value, is written to the result.
7024     // dst = min(xtmp1, xtmp2)
7025     vminsh(dst, xtmp1, xtmp2);
7026     // isNaN = is_unordered_quiet(xtmp1)
7027     evcmpsh(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q);
7028     // The final result is the same as the first source if it is a NaN;
7029     // if the second operand holds a NaN then, as per the above semantics,
7030     // the result is already the same as the second operand.
7031     Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7032   }
7033 }