1 /*
   2  * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "gc/shared/barrierSet.hpp"
  28 #include "gc/shared/barrierSetAssembler.hpp"
  29 #include "oops/methodData.hpp"
  30 #include "opto/c2_MacroAssembler.hpp"
  31 #include "opto/intrinsicnode.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/opcodes.hpp"
  34 #include "opto/subnode.hpp"
  35 #include "runtime/globals.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 #include "utilities/checkedCast.hpp"
  39 #include "utilities/globalDefinitions.hpp"
  40 #include "utilities/powerOfTwo.hpp"
  41 #include "utilities/sizes.hpp"
  42 
  43 #ifdef PRODUCT
  44 #define BLOCK_COMMENT(str) /* nothing */
  45 #define STOP(error) stop(error)
  46 #else
  47 #define BLOCK_COMMENT(str) block_comment(str)
  48 #define STOP(error) block_comment(error); stop(error)
  49 #endif
  50 
  51 // C2 compiled method's prolog code.
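// A rough sketch of the frame set-up emitted below (illustrative pseudo-assembly,
// not the exact output; the precise offsets follow from the framesize adjustments
// in the code):
//
//   with stack bang:                          without stack bang:
//     <bang stack pages>                        subq rsp, adjusted framesize
//     pushq rbp                                 movq [rsp + offset of saved rbp], rbp
//     movq  rbp, rsp   ; if PreserveFramePointer
//     subq  rsp, adjusted framesize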
  52 void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  53   if (C->clinit_barrier_on_entry()) {
  54     assert(VM_Version::supports_fast_class_init_checks(), "sanity");
  55     assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");
  56 
  57     Label L_skip_barrier;
  58     Register klass = rscratch1;
  59 
  60     mov_metadata(klass, C->method()->holder()->constant_encoding());
  61     clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
  62 
  63     jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
  64 
  65     bind(L_skip_barrier);
  66   }
  67 
  68   int framesize = C->output()->frame_size_in_bytes();
  69   int bangsize = C->output()->bang_size_in_bytes();
  70   bool fp_mode_24b = false;
  71   int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;
  72 
  73   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  74 
  75   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  76   // Remove word for return addr
  77   framesize -= wordSize;
  78   stack_bang_size -= wordSize;
  79 
  80   // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack; the stack safety zone should account for that.
  84   // See bugs 4446381, 4468289, 4497237.
  85   if (stack_bang_size > 0) {
  86     generate_stack_overflow_check(stack_bang_size);
  87 
    // We always push rbp, so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
  90     push(rbp);
  91     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  92     if (PreserveFramePointer) {
  93       mov(rbp, rsp);
  94     }
  95     // Remove word for ebp
  96     framesize -= wordSize;
  97 
  98     // Create frame
  99     if (framesize) {
 100       subptr(rsp, framesize);
 101     }
 102   } else {
 103     subptr(rsp, framesize);
 104 
 105     // Save RBP register now.
 106     framesize -= wordSize;
 107     movptr(Address(rsp, framesize), rbp);
 108     // Save caller's stack pointer into RBP if the frame pointer is preserved.
 109     if (PreserveFramePointer) {
 110       movptr(rbp, rsp);
 111       if (framesize > 0) {
 112         addptr(rbp, framesize);
 113       }
 114     }
 115   }
 116 
 117   if (C->needs_stack_repair()) {
 118     // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
 119     assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
 120     movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize);
 121   }
 122 
 123   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 124     framesize -= wordSize;
 125     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 126   }
 127 
 128 #ifdef ASSERT
 129   if (VerifyStackAtCalls) {
 130     Label L;
 131     push(rax);
 132     mov(rax, rsp);
 133     andptr(rax, StackAlignmentInBytes-1);
 134     cmpptr(rax, StackAlignmentInBytes-wordSize);
 135     pop(rax);
 136     jcc(Assembler::equal, L);
 137     STOP("Stack is not properly aligned!");
 138     bind(L);
 139   }
 140 #endif
 141 }
 142 
 143 void C2_MacroAssembler::entry_barrier() {
 144   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 145   // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 146   Label dummy_slow_path;
 147   Label dummy_continuation;
 148   Label* slow_path = &dummy_slow_path;
 149   Label* continuation = &dummy_continuation;
 150   if (!Compile::current()->output()->in_scratch_emit_size()) {
    // Use real labels from the actual stub when not emitting code just to measure its size
 152     C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 153     Compile::current()->output()->add_stub(stub);
 154     slow_path = &stub->entry();
 155     continuation = &stub->continuation();
 156   }
 157   bs->nmethod_entry_barrier(this, slow_path, continuation);
 158 }
 159 
 160 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 161   switch (vlen_in_bytes) {
 162     case  4: // fall-through
 163     case  8: // fall-through
 164     case 16: return Assembler::AVX_128bit;
 165     case 32: return Assembler::AVX_256bit;
 166     case 64: return Assembler::AVX_512bit;
 167 
 168     default: {
 169       ShouldNotReachHere();
 170       return Assembler::AVX_NoVec;
 171     }
 172   }
 173 }
 174 
 175 // fast_lock and fast_unlock used by C2
 176 
 177 // Because the transitions from emitted code to the runtime
 178 // monitorenter/exit helper stubs are so slow it's critical that
 179 // we inline both the stack-locking fast path and the inflated fast path.
 180 //
 181 // See also: cmpFastLock and cmpFastUnlock.
 182 //
 183 // What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat, another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 190 // In practice, however, the # of lock sites is bounded and is usually small.
 191 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
 195 //
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 197 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 198 // to those specialized methods.  That'd give us a mostly platform-independent
 199 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
 201 // to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) provide explicit barriers or fence operations.
 204 //
 205 // TODO:
 206 //
 207 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 208 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 209 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 210 //    the lock operators would typically be faster than reifying Self.
 211 //
 212 // *  Ideally I'd define the primitives as:
 213 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 214 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 215 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
 217 //    Furthermore the register assignments are overconstrained, possibly resulting in
 218 //    sub-optimal code near the synchronization site.
 219 //
 220 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 221 //    Alternately, use a better sp-proximity test.
 222 //
 223 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 224 //    Either one is sufficient to uniquely identify a thread.
 225 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 226 //
 227 // *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty,
//    avoiding the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 230 //
 231 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 232 //    But beware of excessive branch density on AMD Opterons.
 233 //
 234 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 235 //    or failure of the fast path.  If the fast path fails then we pass
 236 //    control to the slow path, typically in C.  In fast_lock and
 237 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 238 //    will emit a conditional branch immediately after the node.
 239 //    So we have branches to branches and lots of ICC.ZF games.
 240 //    Instead, it might be better to have C2 pass a "FailureLabel"
 241 //    into fast_lock and fast_unlock.  In the case of success, control
 242 //    will drop through the node.  ICC.ZF is undefined at exit.
 243 //    In the case of failure, the node will branch directly to the
//    FailureLabel.
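//
// In pseudo-code, the contract both fast paths below follow is roughly:
//   fast_lock(obj, box):  on success fall through with ZF == 1, otherwise ZF == 0
//   fast_unlock(obj):     on success fall through with ZF == 1, otherwise ZF == 0
// and C2 emits its conditional branch on ZF immediately after the node
// (see cmpFastLock and cmpFastUnlock).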
 245 
 246 
 247 // obj: object to lock
 248 // box: on-stack box address -- KILLED
 249 // rax: tmp -- KILLED
 250 // t  : tmp -- KILLED
 251 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 252                                               Register t, Register thread) {
 253   assert(rax_reg == rax, "Used for CAS");
 254   assert_different_registers(obj, box, rax_reg, t, thread);
 255 
 256   // Handle inflated monitor.
 257   Label inflated;
 258   // Finish fast lock successfully. ZF value is irrelevant.
 259   Label locked;
 260   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 261   Label slow_path;
 262 
 263   if (UseObjectMonitorTable) {
 264     // Clear cache in case fast locking succeeds or we need to take the slow-path.
 265     movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
 266   }
 267 
 268   if (DiagnoseSyncOnValueBasedClasses != 0) {
 269     load_klass(rax_reg, obj, t);
 270     testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 271     jcc(Assembler::notZero, slow_path);
 272   }
 273 
 274   const Register mark = t;
 275 
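  // A rough pseudo-code view of the fast path emitted below (a sketch, not emitted code):
  //   mark = obj->mark();
  //   if (mark has the monitor bit set)        goto inflated;
  //   if (lock-stack is full)                  goto slow_path;
  //   if (lock-stack top == obj)               goto push;       // recursive case
  //   if (!CAS(obj->mark, mark | unlocked, mark & ~unlocked))   goto slow_path;
  //  push:
  //   lock-stack.push(obj);  goto locked;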
 276   { // Lightweight Lock
 277 
 278     Label push;
 279 
 280     const Register top = UseObjectMonitorTable ? rax_reg : box;
 281 
 282     // Load the mark.
 283     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 284 
 285     // Prefetch top.
 286     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 287 
 288     // Check for monitor (0b10).
 289     testptr(mark, markWord::monitor_value);
 290     jcc(Assembler::notZero, inflated);
 291 
 292     // Check if lock-stack is full.
 293     cmpl(top, LockStack::end_offset() - 1);
 294     jcc(Assembler::greater, slow_path);
 295 
 296     // Check if recursive.
 297     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 298     jccb(Assembler::equal, push);
 299 
 300     // Try to lock. Transition lock bits 0b01 => 0b00
 301     movptr(rax_reg, mark);
 302     orptr(rax_reg, markWord::unlocked_value);
 303     andptr(mark, ~(int32_t)markWord::unlocked_value);
 304     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 305     jcc(Assembler::notEqual, slow_path);
 306 
 307     if (UseObjectMonitorTable) {
 308       // Need to reload top, clobbered by CAS.
 309       movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 310     }
 311     bind(push);
 312     // After successful lock, push object on lock-stack.
 313     movptr(Address(thread, top), obj);
 314     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 315     jmpb(locked);
 316   }
 317 
 318   { // Handle inflated monitor.
 319     bind(inflated);
 320 
 321     const Register monitor = t;
 322 
 323     if (!UseObjectMonitorTable) {
 324       assert(mark == monitor, "should be the same here");
 325     } else {
 326       // Uses ObjectMonitorTable.  Look for the monitor in the om_cache.
 327       // Fetch ObjectMonitor* from the cache or take the slow-path.
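      // Probe shape: 'num_unrolled' straight-line compares against the first cache
      // slots, then a loop that scans until it either matches obj or hits the
      // trailing null sentinel, in which case it jumps to the slow path with ZF == 0.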
 328       Label monitor_found;
 329 
 330       // Load cache address
 331       lea(t, Address(thread, JavaThread::om_cache_oops_offset()));
 332 
 333       const int num_unrolled = 2;
 334       for (int i = 0; i < num_unrolled; i++) {
 335         cmpptr(obj, Address(t));
 336         jccb(Assembler::equal, monitor_found);
 337         increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 338       }
 339 
 340       Label loop;
 341 
 342       // Search for obj in cache.
 343       bind(loop);
 344 
 345       // Check for match.
 346       cmpptr(obj, Address(t));
 347       jccb(Assembler::equal, monitor_found);
 348 
 349       // Search until null encountered, guaranteed _null_sentinel at end.
 350       cmpptr(Address(t), 1);
 351       jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
 352       increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 353       jmpb(loop);
 354 
 355       // Cache hit.
 356       bind(monitor_found);
 357       movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
 358     }
 359     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 360     const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 361     const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);
 362 
 363     Label monitor_locked;
 364     // Lock the monitor.
 365 
 366     if (UseObjectMonitorTable) {
 367       // Cache the monitor for unlock before trashing box. On failure to acquire
 368       // the lock, the slow path will reset the entry accordingly (see CacheSetter).
 369       movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
 370     }
 371 
 372     // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 373     xorptr(rax_reg, rax_reg);
 374     movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
 375     lock(); cmpxchgptr(box, owner_address);
 376     jccb(Assembler::equal, monitor_locked);
 377 
 378     // Check if recursive.
 379     cmpptr(box, rax_reg);
 380     jccb(Assembler::notEqual, slow_path);
 381 
 382     // Recursive.
 383     increment(recursions_address);
 384 
 385     bind(monitor_locked);
 386   }
 387 
 388   bind(locked);
 389   // Set ZF = 1
 390   xorl(rax_reg, rax_reg);
 391 
 392 #ifdef ASSERT
 393   // Check that locked label is reached with ZF set.
 394   Label zf_correct;
 395   Label zf_bad_zero;
 396   jcc(Assembler::zero, zf_correct);
 397   jmp(zf_bad_zero);
 398 #endif
 399 
 400   bind(slow_path);
 401 #ifdef ASSERT
 402   // Check that slow_path label is reached with ZF not set.
 403   jcc(Assembler::notZero, zf_correct);
 404   stop("Fast Lock ZF != 0");
 405   bind(zf_bad_zero);
 406   stop("Fast Lock ZF != 1");
 407   bind(zf_correct);
 408 #endif
 409   // C2 uses the value of ZF to determine the continuation.
 410 }
 411 
// obj: object to unlock
 413 // rax: tmp -- KILLED
 414 // t  : tmp - cannot be obj nor rax -- KILLED
 415 //
 416 // Some commentary on balanced locking:
 417 //
 418 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 419 // Methods that don't have provably balanced locking are forced to run in the
 420 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 421 // The interpreter provides two properties:
 422 // I1:  At return-time the interpreter automatically and quietly unlocks any
 423 //      objects acquired in the current activation (frame).  Recall that the
 424 //      interpreter maintains an on-stack list of locks currently held by
 425 //      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame, the interpreter throws IMSX (IllegalMonitorStateException).
 428 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 430 // B() doesn't have provably balanced locking so it runs in the interpreter.
 431 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 432 // is still locked by A().
 433 //
 434 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface
 435 // Specification" states that an object locked by JNI's MonitorEnter should not be
 436 // unlocked by "normal" java-level locking and vice-versa.  The specification doesn't
 437 // specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
 441 // A perfectly viable alternative is to elide the owner check except when
 442 // Xcheck:jni is enabled.
 443 
 444 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
 445   assert(reg_rax == rax, "Used for CAS");
 446   assert_different_registers(obj, reg_rax, t);
 447 
 448   // Handle inflated monitor.
 449   Label inflated, inflated_check_lock_stack;
 450   // Finish fast unlock successfully.  MUST jump with ZF == 1
 451   Label unlocked, slow_path;
 452 
 453   const Register mark = t;
 454   const Register monitor = t;
 455   const Register top = UseObjectMonitorTable ? t : reg_rax;
 456   const Register box = reg_rax;
 457 
 458   Label dummy;
 459   C2FastUnlockLightweightStub* stub = nullptr;
 460 
 461   if (!Compile::current()->output()->in_scratch_emit_size()) {
 462     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
 463     Compile::current()->output()->add_stub(stub);
 464   }
 465 
 466   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
 467 
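  // A rough pseudo-code view of the fast path emitted below (a sketch, not emitted code):
  //   if (lock-stack top != obj)               goto inflated_check_lock_stack;
  //   lock-stack.pop();
  //   if (new lock-stack top == obj)           goto unlocked;    // recursive case
  //   if (!CAS(obj->mark, mark & ~lock_mask, mark | unlocked))   goto push_and_slow_path;
  //   goto unlocked;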
 468   { // Lightweight Unlock
 469 
 470     // Load top.
 471     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 472 
 473     if (!UseObjectMonitorTable) {
 474       // Prefetch mark.
 475       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 476     }
 477 
 478     // Check if obj is top of lock-stack.
 479     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 480     // Top of lock stack was not obj. Must be monitor.
 481     jcc(Assembler::notEqual, inflated_check_lock_stack);
 482 
 483     // Pop lock-stack.
 484     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
 485     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 486 
 487     // Check if recursive.
 488     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
 489     jcc(Assembler::equal, unlocked);
 490 
 491     // We elide the monitor check, let the CAS fail instead.
 492 
 493     if (UseObjectMonitorTable) {
 494       // Load mark.
 495       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 496     }
 497 
 498     // Try to unlock. Transition lock bits 0b00 => 0b01
 499     movptr(reg_rax, mark);
 500     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
 501     orptr(mark, markWord::unlocked_value);
 502     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 503     jcc(Assembler::notEqual, push_and_slow_path);
 504     jmp(unlocked);
 505   }
 506 
 507 
 508   { // Handle inflated monitor.
 509     bind(inflated_check_lock_stack);
 510 #ifdef ASSERT
 511     Label check_done;
 512     subl(top, oopSize);
 513     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
 514     jcc(Assembler::below, check_done);
 515     cmpptr(obj, Address(thread, top));
 516     jccb(Assembler::notEqual, inflated_check_lock_stack);
 517     stop("Fast Unlock lock on stack");
 518     bind(check_done);
 519     if (UseObjectMonitorTable) {
 520       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 521     }
 522     testptr(mark, markWord::monitor_value);
 523     jccb(Assembler::notZero, inflated);
 524     stop("Fast Unlock not monitor");
 525 #endif
 526 
 527     bind(inflated);
 528 
 529     if (!UseObjectMonitorTable) {
 530       assert(mark == monitor, "should be the same here");
 531     } else {
 532       // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
 533       movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 534       // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
 535       cmpptr(monitor, alignof(ObjectMonitor*));
 536       jcc(Assembler::below, slow_path);
 537     }
 538     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 539     const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
 540     const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
 541     const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
 542     const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
 543 
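    // A rough pseudo-code view of the exit path emitted below (a sketch, not emitted code):
    //   if (recursions != 0) { recursions--; goto unlocked; }
    //   owner = null;  StoreLoad fence;
    //   if (entry_list == null)  goto unlocked;
    //   if (succ != null)        goto unlocked;
    //   thread->unlocked_inflated_monitor = monitor;   // let the slow path re-acquire
    //   ZF = 0;  goto slow_path;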
 544     Label recursive;
 545 
 546     // Check if recursive.
 547     cmpptr(recursions_address, 0);
 548     jccb(Assembler::notZero, recursive);
 549 
 550     // Set owner to null.
 551     // Release to satisfy the JMM
 552     movptr(owner_address, NULL_WORD);
 553     // We need a full fence after clearing owner to avoid stranding.
 554     // StoreLoad achieves this.
 555     membar(StoreLoad);
 556 
 557     // Check if the entry_list is empty.
 558     cmpptr(entry_list_address, NULL_WORD);
 559     jccb(Assembler::zero, unlocked);    // If so we are done.
 560 
 561     // Check if there is a successor.
 562     cmpptr(succ_address, NULL_WORD);
 563     jccb(Assembler::notZero, unlocked); // If so we are done.
 564 
 565     // Save the monitor pointer in the current thread, so we can try to
 566     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 567     if (!UseObjectMonitorTable) {
 568       andptr(monitor, ~(int32_t)markWord::monitor_value);
 569     }
 570     movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);
 571 
 572     orl(t, 1); // Fast Unlock ZF = 0
 573     jmpb(slow_path);
 574 
 575     // Recursive unlock.
 576     bind(recursive);
 577     decrement(recursions_address);
 578   }
 579 
 580   bind(unlocked);
 581   xorl(t, t); // Fast Unlock ZF = 1
 582 
 583 #ifdef ASSERT
 584   // Check that unlocked label is reached with ZF set.
 585   Label zf_correct;
 586   Label zf_bad_zero;
 587   jcc(Assembler::zero, zf_correct);
 588   jmp(zf_bad_zero);
 589 #endif
 590 
 591   bind(slow_path);
 592   if (stub != nullptr) {
 593     bind(stub->slow_path_continuation());
 594   }
 595 #ifdef ASSERT
 596   // Check that stub->continuation() label is reached with ZF not set.
 597   jcc(Assembler::notZero, zf_correct);
 598   stop("Fast Unlock ZF != 0");
 599   bind(zf_bad_zero);
 600   stop("Fast Unlock ZF != 1");
 601   bind(zf_correct);
 602 #endif
 603   // C2 uses the value of ZF to determine the continuation.
 604 }
 605 
 606 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
 607   fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
 608 }
 609 
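// Recomputes the caller-visible frame pointer from rsp: the saved rbp slot sits at
// rsp + framesize - 2*wordSize (one word below the return address), which is where
// rbp points when PreserveFramePointer is enabled (see verified_entry above).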
 610 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
 611   const int framesize = Compile::current()->output()->frame_size_in_bytes();
 612   masm->movptr(dst, rsp);
 613   if (framesize > 2 * wordSize) {
 614     masm->addptr(dst, framesize - 2 * wordSize);
 615   }
 616 }
 617 
 618 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
 619   if (PreserveFramePointer) {
 620     // frame pointer is valid
 621 #ifdef ASSERT
 622     // Verify frame pointer value in rbp.
 623     reconstruct_frame_pointer_helper(this, rtmp);
 624     Label L_success;
 625     cmpq(rbp, rtmp);
 626     jccb(Assembler::equal, L_success);
 627     STOP("frame pointer mismatch");
 628     bind(L_success);
 629 #endif // ASSERT
 630   } else {
 631     reconstruct_frame_pointer_helper(this, rbp);
 632   }
 633 }
 634 
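// Emits a range check for a CastII node: if val falls outside [t->_lo, t->_hi], the
// code reconstructs the frame pointer and calls abort_verify_int_in_range(), which is
// fatal. As an illustration, for a hypothetical range [0, 99] this emits roughly:
//   cmpl val, 0;  jl fail;  cmpl val, 99;  jle succeed;  fail: <report and hlt>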
 635 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
 636   jint lo = t->_lo;
 637   jint hi = t->_hi;
 638   assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
 639   if (t == TypeInt::INT) {
 640     return;
 641   }
 642 
 643   BLOCK_COMMENT("CastII {");
 644   Label fail;
 645   Label succeed;
 646   if (hi == max_jint) {
 647     cmpl(val, lo);
 648     jccb(Assembler::greaterEqual, succeed);
 649   } else {
 650     if (lo != min_jint) {
 651       cmpl(val, lo);
 652       jccb(Assembler::less, fail);
 653     }
 654     cmpl(val, hi);
 655     jccb(Assembler::lessEqual, succeed);
 656   }
 657 
 658   bind(fail);
 659   movl(c_rarg0, idx);
 660   movl(c_rarg1, val);
 661   movl(c_rarg2, lo);
 662   movl(c_rarg3, hi);
 663   reconstruct_frame_pointer(rscratch1);
 664   call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
 665   hlt();
 666   bind(succeed);
 667   BLOCK_COMMENT("} // CastII");
 668 }
 669 
 670 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
 671   fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
 672 }
 673 
 674 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
 675   jlong lo = t->_lo;
 676   jlong hi = t->_hi;
 677   assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
 678   if (t == TypeLong::LONG) {
 679     return;
 680   }
 681 
 682   BLOCK_COMMENT("CastLL {");
 683   Label fail;
 684   Label succeed;
 685 
 686   auto cmp_val = [&](jlong bound) {
 687     if (is_simm32(bound)) {
 688       cmpq(val, checked_cast<int>(bound));
 689     } else {
 690       mov64(tmp, bound);
 691       cmpq(val, tmp);
 692     }
 693   };
 694 
 695   if (hi == max_jlong) {
 696     cmp_val(lo);
 697     jccb(Assembler::greaterEqual, succeed);
 698   } else {
 699     if (lo != min_jlong) {
 700       cmp_val(lo);
 701       jccb(Assembler::less, fail);
 702     }
 703     cmp_val(hi);
 704     jccb(Assembler::lessEqual, succeed);
 705   }
 706 
 707   bind(fail);
 708   movl(c_rarg0, idx);
 709   movq(c_rarg1, val);
 710   mov64(c_rarg2, lo);
 711   mov64(c_rarg3, hi);
 712   reconstruct_frame_pointer(rscratch1);
 713   call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
 714   hlt();
 715   bind(succeed);
 716   BLOCK_COMMENT("} // CastLL");
 717 }
 718 
 719 //-------------------------------------------------------------------------------------------
 720 // Generic instructions support for use in .ad files C2 code generation
 721 
 722 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 723   if (dst != src) {
 724     movdqu(dst, src);
 725   }
 726   if (opcode == Op_AbsVD) {
 727     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 728   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 730     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 731   }
 732 }
 733 
 734 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 735   if (opcode == Op_AbsVD) {
 736     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 737   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 739     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 740   }
 741 }
 742 
 743 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 744   if (dst != src) {
 745     movdqu(dst, src);
 746   }
 747   if (opcode == Op_AbsVF) {
 748     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 749   } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
 751     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 752   }
 753 }
 754 
 755 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 756   if (opcode == Op_AbsVF) {
 757     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 758   } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
 760     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 761   }
 762 }
 763 
 764 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 765   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 766   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 767 
 768   if (opcode == Op_MinV) {
 769     if (elem_bt == T_BYTE) {
 770       pminsb(dst, src);
 771     } else if (elem_bt == T_SHORT) {
 772       pminsw(dst, src);
 773     } else if (elem_bt == T_INT) {
 774       pminsd(dst, src);
 775     } else {
 776       assert(elem_bt == T_LONG, "required");
 777       assert(tmp == xmm0, "required");
 778       assert_different_registers(dst, src, tmp);
 779       movdqu(xmm0, dst);
 780       pcmpgtq(xmm0, src);
 781       blendvpd(dst, src);  // xmm0 as mask
 782     }
 783   } else { // opcode == Op_MaxV
 784     if (elem_bt == T_BYTE) {
 785       pmaxsb(dst, src);
 786     } else if (elem_bt == T_SHORT) {
 787       pmaxsw(dst, src);
 788     } else if (elem_bt == T_INT) {
 789       pmaxsd(dst, src);
 790     } else {
 791       assert(elem_bt == T_LONG, "required");
 792       assert(tmp == xmm0, "required");
 793       assert_different_registers(dst, src, tmp);
 794       movdqu(xmm0, src);
 795       pcmpgtq(xmm0, dst);
 796       blendvpd(dst, src);  // xmm0 as mask
 797     }
 798   }
 799 }
 800 
 801 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 802                                   XMMRegister src1, Address src2, int vlen_enc) {
 803   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 804   if (opcode == Op_UMinV) {
 805     switch(elem_bt) {
 806       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 807       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 808       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 809       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 810       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 811     }
 812   } else {
 813     assert(opcode == Op_UMaxV, "required");
 814     switch(elem_bt) {
 815       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 816       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 817       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 818       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 819       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 820     }
 821   }
 822 }
 823 
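// Unsigned 64-bit min/max without a native unsigned compare: adding 2^63 to both
// operands (i.e. flipping the sign bit) preserves unsigned order under a signed
// compare, since x <u y  <=>  (x + 2^63) <s (y + 2^63) modulo 2^64. The pre-AVX512VL
// branch below biases both inputs this way and then uses vpcmpgtq plus a blend.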
 824 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
 825   // For optimality, leverage a full vector width of 512 bits
 826   // for operations over smaller vector sizes on AVX512 targets.
 827   if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 828     if (opcode == Op_UMaxV) {
 829       evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 830     } else {
 831       assert(opcode == Op_UMinV, "required");
 832       evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 833     }
 834   } else {
 835     // T1 = -1
 836     vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
 837     // T1 = -1 << 63
 838     vpsllq(xtmp1, xtmp1, 63, vlen_enc);
 839     // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
 840     vpaddq(xtmp2, xtmp1, src2, vlen_enc);
 841     // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
 842     vpaddq(xtmp1, xtmp1, src1, vlen_enc);
 843     // Mask = T2 > T1
 844     vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
 845     if (opcode == Op_UMaxV) {
 846       // Res = Mask ? Src2 : Src1
 847       vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
 848     } else {
 849       // Res = Mask ? Src1 : Src2
 850       vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
 851     }
 852   }
 853 }
 854 
 855 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 856                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
 857   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 858   if (opcode == Op_UMinV) {
 859     switch(elem_bt) {
 860       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 861       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 862       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 863       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 864       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 865     }
 866   } else {
 867     assert(opcode == Op_UMaxV, "required");
 868     switch(elem_bt) {
 869       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 870       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 871       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 872       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 873       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 874     }
 875   }
 876 }
 877 
 878 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 879                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 880                                  int vlen_enc) {
 881   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 882 
 883   if (opcode == Op_MinV) {
 884     if (elem_bt == T_BYTE) {
 885       vpminsb(dst, src1, src2, vlen_enc);
 886     } else if (elem_bt == T_SHORT) {
 887       vpminsw(dst, src1, src2, vlen_enc);
 888     } else if (elem_bt == T_INT) {
 889       vpminsd(dst, src1, src2, vlen_enc);
 890     } else {
 891       assert(elem_bt == T_LONG, "required");
 892       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 893         vpminsq(dst, src1, src2, vlen_enc);
 894       } else {
 895         assert_different_registers(dst, src1, src2);
 896         vpcmpgtq(dst, src1, src2, vlen_enc);
 897         vblendvpd(dst, src1, src2, dst, vlen_enc);
 898       }
 899     }
 900   } else { // opcode == Op_MaxV
 901     if (elem_bt == T_BYTE) {
 902       vpmaxsb(dst, src1, src2, vlen_enc);
 903     } else if (elem_bt == T_SHORT) {
 904       vpmaxsw(dst, src1, src2, vlen_enc);
 905     } else if (elem_bt == T_INT) {
 906       vpmaxsd(dst, src1, src2, vlen_enc);
 907     } else {
 908       assert(elem_bt == T_LONG, "required");
 909       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 910         vpmaxsq(dst, src1, src2, vlen_enc);
 911       } else {
 912         assert_different_registers(dst, src1, src2);
 913         vpcmpgtq(dst, src1, src2, vlen_enc);
 914         vblendvpd(dst, src2, src1, dst, vlen_enc);
 915       }
 916     }
 917   }
 918 }
 919 
 920 // Float/Double min max
 921 
 922 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
 923                                    XMMRegister dst, XMMRegister a, XMMRegister b,
 924                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
 925                                    int vlen_enc) {
 926   assert(UseAVX > 0, "required");
 927   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
 928          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
 929   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
 930   assert_different_registers(a, tmp, atmp, btmp);
 931   assert_different_registers(b, tmp, atmp, btmp);
 932 
 933   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
 934   bool is_double_word = is_double_word_type(elem_bt);
 935 
 936   /* Note on 'non-obvious' assembly sequence:
 937    *
 938    * While there are vminps/vmaxps instructions, there are two important differences between hardware
 939    * and Java on how they handle floats:
 940    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
 942    *
 943    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
 944    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
 945    *                (only useful when signs differ, noop otherwise)
 946    *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   *  The following pseudo code describes the algorithm for max[FD] (the min algorithm is on similar lines):
 949    *   btmp = (b < +0.0) ? a : b
 950    *   atmp = (b < +0.0) ? b : a
 951    *   Tmp  = Max_Float(atmp , btmp)
 952    *   Res  = (atmp == NaN) ? atmp : Tmp
 953    */
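  /* For min[FD] the blend mask comes from 'a' instead of 'b'; a sketch mirroring the
   * code below:
   *   btmp = (a < +0.0) ? a : b
   *   atmp = (a < +0.0) ? b : a
   *   Tmp  = Min_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */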
 954 
 955   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
 956   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
 957   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
 958   XMMRegister mask;
 959 
 960   if (!is_double_word && is_min) {
 961     mask = a;
 962     vblend = &MacroAssembler::vblendvps;
 963     vmaxmin = &MacroAssembler::vminps;
 964     vcmp = &MacroAssembler::vcmpps;
 965   } else if (!is_double_word && !is_min) {
 966     mask = b;
 967     vblend = &MacroAssembler::vblendvps;
 968     vmaxmin = &MacroAssembler::vmaxps;
 969     vcmp = &MacroAssembler::vcmpps;
 970   } else if (is_double_word && is_min) {
 971     mask = a;
 972     vblend = &MacroAssembler::vblendvpd;
 973     vmaxmin = &MacroAssembler::vminpd;
 974     vcmp = &MacroAssembler::vcmppd;
 975   } else {
 976     assert(is_double_word && !is_min, "sanity");
 977     mask = b;
 978     vblend = &MacroAssembler::vblendvpd;
 979     vmaxmin = &MacroAssembler::vmaxpd;
 980     vcmp = &MacroAssembler::vcmppd;
 981   }
 982 
 983   // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
 984   XMMRegister maxmin, scratch;
 985   if (dst == btmp) {
 986     maxmin = btmp;
 987     scratch = tmp;
 988   } else {
 989     maxmin = tmp;
 990     scratch = btmp;
 991   }
 992 
 993   bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
 994   if (precompute_mask && !is_double_word) {
 995     vpsrad(tmp, mask, 32, vlen_enc);
 996     mask = tmp;
 997   } else if (precompute_mask && is_double_word) {
 998     vpxor(tmp, tmp, tmp, vlen_enc);
 999     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1000     mask = tmp;
1001   }
1002 
1003   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1004   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1005   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1006   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1007   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1008 }
1009 
1010 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1011                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1012                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1013                                     int vlen_enc) {
1014   assert(UseAVX > 2, "required");
1015   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1016          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1017   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1018   assert_different_registers(dst, a, atmp, btmp);
1019   assert_different_registers(dst, b, atmp, btmp);
1020 
1021   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1022   bool is_double_word = is_double_word_type(elem_bt);
1023   bool merge = true;
1024 
1025   if (!is_double_word && is_min) {
1026     evpmovd2m(ktmp, a, vlen_enc);
1027     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1028     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1029     vminps(dst, atmp, btmp, vlen_enc);
1030     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1031     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1032   } else if (!is_double_word && !is_min) {
1033     evpmovd2m(ktmp, b, vlen_enc);
1034     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1035     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1036     vmaxps(dst, atmp, btmp, vlen_enc);
1037     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1038     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1039   } else if (is_double_word && is_min) {
1040     evpmovq2m(ktmp, a, vlen_enc);
1041     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1042     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1043     vminpd(dst, atmp, btmp, vlen_enc);
1044     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1045     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1046   } else {
1047     assert(is_double_word && !is_min, "sanity");
1048     evpmovq2m(ktmp, b, vlen_enc);
1049     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1050     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1051     vmaxpd(dst, atmp, btmp, vlen_enc);
1052     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1053     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1054   }
1055 }
1056 
1057 void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1058                                    XMMRegister src1, XMMRegister src2, int vlen_enc) {
1059   assert(opc == Op_MinV || opc == Op_MinReductionV ||
1060          opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");
1061 
1062   int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_MINMAX_MIN_COMPARE_SIGN
1063                                                          : AVX10_MINMAX_MAX_COMPARE_SIGN;
1064   if (elem_bt == T_FLOAT) {
1065     evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
1066   } else {
1067     assert(elem_bt == T_DOUBLE, "");
1068     evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
1069   }
1070 }
1071 
1072 // Float/Double signum
1073 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1074   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1075 
1076   Label DONE_LABEL;
1077 
1078   if (opcode == Op_SignumF) {
1079     ucomiss(dst, zero);
1080     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1081     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1082     movflt(dst, one);
1083     jcc(Assembler::above, DONE_LABEL);
1084     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1085   } else if (opcode == Op_SignumD) {
1086     ucomisd(dst, zero);
1087     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1088     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1089     movdbl(dst, one);
1090     jcc(Assembler::above, DONE_LABEL);
1091     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1092   }
1093 
1094   bind(DONE_LABEL);
1095 }
1096 
1097 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1098   if (sign) {
1099     pmovsxbw(dst, src);
1100   } else {
1101     pmovzxbw(dst, src);
1102   }
1103 }
1104 
1105 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1106   if (sign) {
1107     vpmovsxbw(dst, src, vector_len);
1108   } else {
1109     vpmovzxbw(dst, src, vector_len);
1110   }
1111 }
1112 
1113 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1114   if (sign) {
1115     vpmovsxbd(dst, src, vector_len);
1116   } else {
1117     vpmovzxbd(dst, src, vector_len);
1118   }
1119 }
1120 
1121 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1122   if (sign) {
1123     vpmovsxwd(dst, src, vector_len);
1124   } else {
1125     vpmovzxwd(dst, src, vector_len);
1126   }
1127 }
1128 
1129 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1130                                      int shift, int vector_len) {
1131   if (opcode == Op_RotateLeftV) {
1132     if (etype == T_INT) {
1133       evprold(dst, src, shift, vector_len);
1134     } else {
1135       assert(etype == T_LONG, "expected type T_LONG");
1136       evprolq(dst, src, shift, vector_len);
1137     }
1138   } else {
1139     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1140     if (etype == T_INT) {
1141       evprord(dst, src, shift, vector_len);
1142     } else {
1143       assert(etype == T_LONG, "expected type T_LONG");
1144       evprorq(dst, src, shift, vector_len);
1145     }
1146   }
1147 }
1148 
1149 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1150                                      XMMRegister shift, int vector_len) {
1151   if (opcode == Op_RotateLeftV) {
1152     if (etype == T_INT) {
1153       evprolvd(dst, src, shift, vector_len);
1154     } else {
1155       assert(etype == T_LONG, "expected type T_LONG");
1156       evprolvq(dst, src, shift, vector_len);
1157     }
1158   } else {
1159     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1160     if (etype == T_INT) {
1161       evprorvd(dst, src, shift, vector_len);
1162     } else {
1163       assert(etype == T_LONG, "expected type T_LONG");
1164       evprorvq(dst, src, shift, vector_len);
1165     }
1166   }
1167 }
1168 
1169 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1170   if (opcode == Op_RShiftVI) {
1171     psrad(dst, shift);
1172   } else if (opcode == Op_LShiftVI) {
1173     pslld(dst, shift);
1174   } else {
1175     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1176     psrld(dst, shift);
1177   }
1178 }
1179 
1180 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1181   switch (opcode) {
1182     case Op_RShiftVI:  psrad(dst, shift); break;
1183     case Op_LShiftVI:  pslld(dst, shift); break;
1184     case Op_URShiftVI: psrld(dst, shift); break;
1185 
1186     default: assert(false, "%s", NodeClassNames[opcode]);
1187   }
1188 }
1189 
1190 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1191   if (opcode == Op_RShiftVI) {
1192     vpsrad(dst, nds, shift, vector_len);
1193   } else if (opcode == Op_LShiftVI) {
1194     vpslld(dst, nds, shift, vector_len);
1195   } else {
1196     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1197     vpsrld(dst, nds, shift, vector_len);
1198   }
1199 }
1200 
1201 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1202   switch (opcode) {
1203     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1204     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1205     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1206 
1207     default: assert(false, "%s", NodeClassNames[opcode]);
1208   }
1209 }
1210 
1211 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1212   switch (opcode) {
1213     case Op_RShiftVB:  // fall-through
1214     case Op_RShiftVS:  psraw(dst, shift); break;
1215 
1216     case Op_LShiftVB:  // fall-through
1217     case Op_LShiftVS:  psllw(dst, shift);   break;
1218 
1219     case Op_URShiftVS: // fall-through
1220     case Op_URShiftVB: psrlw(dst, shift);  break;
1221 
1222     default: assert(false, "%s", NodeClassNames[opcode]);
1223   }
1224 }
1225 
1226 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1227   switch (opcode) {
1228     case Op_RShiftVB:  // fall-through
1229     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1230 
1231     case Op_LShiftVB:  // fall-through
1232     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1233 
1234     case Op_URShiftVS: // fall-through
1235     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1236 
1237     default: assert(false, "%s", NodeClassNames[opcode]);
1238   }
1239 }
1240 
1241 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1242   switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1244     case Op_LShiftVL:  psllq(dst, shift); break;
1245     case Op_URShiftVL: psrlq(dst, shift); break;
1246 
1247     default: assert(false, "%s", NodeClassNames[opcode]);
1248   }
1249 }
1250 
1251 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1252   if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1254   } else if (opcode == Op_LShiftVL) {
1255     psllq(dst, shift);
1256   } else {
1257     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1258     psrlq(dst, shift);
1259   }
1260 }
1261 
1262 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1263   switch (opcode) {
1264     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1265     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1266     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1267 
1268     default: assert(false, "%s", NodeClassNames[opcode]);
1269   }
1270 }
1271 
1272 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1273   if (opcode == Op_RShiftVL) {
1274     evpsraq(dst, nds, shift, vector_len);
1275   } else if (opcode == Op_LShiftVL) {
1276     vpsllq(dst, nds, shift, vector_len);
1277   } else {
1278     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1279     vpsrlq(dst, nds, shift, vector_len);
1280   }
1281 }
1282 
1283 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1284   switch (opcode) {
1285     case Op_RShiftVB:  // fall-through
1286     case Op_RShiftVS:  // fall-through
1287     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1288 
1289     case Op_LShiftVB:  // fall-through
1290     case Op_LShiftVS:  // fall-through
1291     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1292 
1293     case Op_URShiftVB: // fall-through
1294     case Op_URShiftVS: // fall-through
1295     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1296 
1297     default: assert(false, "%s", NodeClassNames[opcode]);
1298   }
1299 }
1300 
1301 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1302   switch (opcode) {
1303     case Op_RShiftVB:  // fall-through
1304     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1305 
1306     case Op_LShiftVB:  // fall-through
1307     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1308 
1309     case Op_URShiftVB: // fall-through
1310     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1311 
1312     default: assert(false, "%s", NodeClassNames[opcode]);
1313   }
1314 }
1315 
1316 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1317   assert(UseAVX >= 2, "required");
1318   switch (opcode) {
1319     case Op_RShiftVL: {
1320       if (UseAVX > 2) {
1321         assert(tmp == xnoreg, "not used");
1322         if (!VM_Version::supports_avx512vl()) {
1323           vlen_enc = Assembler::AVX_512bit;
1324         }
1325         evpsravq(dst, src, shift, vlen_enc);
1326       } else {
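        // There is no native 64-bit arithmetic right shift below AVX-512, so emulate
        // it by sign-extending a logical shift: (x >>> s ^ m) - m, with m being the
        // per-lane sign mask shifted right by the same amount.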
1327         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1328         vpsrlvq(dst, src, shift, vlen_enc);
1329         vpsrlvq(tmp, tmp, shift, vlen_enc);
1330         vpxor(dst, dst, tmp, vlen_enc);
1331         vpsubq(dst, dst, tmp, vlen_enc);
1332       }
1333       break;
1334     }
1335     case Op_LShiftVL: {
1336       assert(tmp == xnoreg, "not used");
1337       vpsllvq(dst, src, shift, vlen_enc);
1338       break;
1339     }
1340     case Op_URShiftVL: {
1341       assert(tmp == xnoreg, "not used");
1342       vpsrlvq(dst, src, shift, vlen_enc);
1343       break;
1344     }
1345     default: assert(false, "%s", NodeClassNames[opcode]);
1346   }
1347 }
1348 
// Variable shift src by shift using vtmp as a TEMP, giving word result in dst
1350 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1351   assert(opcode == Op_LShiftVB ||
1352          opcode == Op_RShiftVB ||
1353          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1354   bool sign = (opcode != Op_URShiftVB);
1355   assert(vector_len == 0, "required");
1356   vextendbd(sign, dst, src, 1);
1357   vpmovzxbd(vtmp, shift, 1);
1358   varshiftd(opcode, dst, dst, vtmp, 1);
1359   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1360   vextracti128_high(vtmp, dst);
1361   vpackusdw(dst, dst, vtmp, 0);
1362 }
1363 
// Variable shift src by shift using vtmp as a TEMP, giving byte result in dst
1365 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1366   assert(opcode == Op_LShiftVB ||
1367          opcode == Op_RShiftVB ||
1368          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1369   bool sign = (opcode != Op_URShiftVB);
1370   int ext_vector_len = vector_len + 1;
1371   vextendbw(sign, dst, src, ext_vector_len);
1372   vpmovzxbw(vtmp, shift, ext_vector_len);
1373   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1374   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1375   if (vector_len == 0) {
1376     vextracti128_high(vtmp, dst);
1377     vpackuswb(dst, dst, vtmp, vector_len);
1378   } else {
1379     vextracti64x4_high(vtmp, dst);
1380     vpackuswb(dst, dst, vtmp, vector_len);
1381     vpermq(dst, dst, 0xD8, vector_len);
1382   }
1383 }
1384 
1385 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1386   switch(typ) {
1387     case T_BYTE:
1388       pinsrb(dst, val, idx);
1389       break;
1390     case T_SHORT:
1391       pinsrw(dst, val, idx);
1392       break;
1393     case T_INT:
1394       pinsrd(dst, val, idx);
1395       break;
1396     case T_LONG:
1397       pinsrq(dst, val, idx);
1398       break;
1399     default:
1400       assert(false,"Should not reach here.");
1401       break;
1402   }
1403 }
1404 
1405 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1406   switch(typ) {
1407     case T_BYTE:
1408       vpinsrb(dst, src, val, idx);
1409       break;
1410     case T_SHORT:
1411       vpinsrw(dst, src, val, idx);
1412       break;
1413     case T_INT:
1414       vpinsrd(dst, src, val, idx);
1415       break;
1416     case T_LONG:
1417       vpinsrq(dst, src, val, idx);
1418       break;
1419     default:
1420       assert(false,"Should not reach here.");
1421       break;
1422   }
1423 }
1424 
1425 void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
1426                                          Register base, Register idx_base,
1427                                          Register mask, Register mask_idx,
1428                                          Register rtmp, int vlen_enc) {
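       // Scalar emulation of a masked subword gather: a lane is loaded only when its
       // mask bit is set, and mask_idx is advanced even for skipped lanes so the mask
       // bit cursor stays in step with the overall element index.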
1429   vpxor(dst, dst, dst, vlen_enc);
1430   if (elem_bt == T_SHORT) {
1431     for (int i = 0; i < 4; i++) {
1432       // dst[i] = mask[i] ? src[idx_base[i]] : 0
1433       Label skip_load;
1434       btq(mask, mask_idx);
1435       jccb(Assembler::carryClear, skip_load);
1436       movl(rtmp, Address(idx_base, i * 4));
1437       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1438       bind(skip_load);
1439       incq(mask_idx);
1440     }
1441   } else {
1442     assert(elem_bt == T_BYTE, "");
1443     for (int i = 0; i < 8; i++) {
1444       // dst[i] = mask[i] ? src[idx_base[i]] : 0
1445       Label skip_load;
1446       btq(mask, mask_idx);
1447       jccb(Assembler::carryClear, skip_load);
1448       movl(rtmp, Address(idx_base, i * 4));
1449       pinsrb(dst, Address(base, rtmp), i);
1450       bind(skip_load);
1451       incq(mask_idx);
1452     }
1453   }
1454 }
1455 
1456 void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
1457                                   Register base, Register idx_base,
1458                                   Register rtmp, int vlen_enc) {
1459   vpxor(dst, dst, dst, vlen_enc);
1460   if (elem_bt == T_SHORT) {
1461     for (int i = 0; i < 4; i++) {
1462       // dst[i] = src[idx_base[i]]
1463       movl(rtmp, Address(idx_base, i * 4));
1464       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1465     }
1466   } else {
1467     assert(elem_bt == T_BYTE, "");
1468     for (int i = 0; i < 8; i++) {
1469       // dst[i] = src[idx_base[i]]
1470       movl(rtmp, Address(idx_base, i * 4));
1471       pinsrb(dst, Address(base, rtmp), i);
1472     }
1473   }
1474 }
1475 
1476 /*
1477  * Gather using hybrid algorithm, first partially unroll scalar loop
1478  * to accumulate values from gather indices into a quad-word (64-bit) slice.
1479  * A slice may hold 8 bytes or 4 short values. This is followed by a vector
1480  * permutation to place the slice into appropriate vector lane
1481  * locations in destination vector. Following pseudo code describes the
1482  * algorithm in detail:
1483  *
1484  * DST_VEC = ZERO_VEC
1485  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1486  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1487  * FOREACH_ITER:
1488  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1489  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1490  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1491  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1492  *
1493  * With each iteration, the doubleword permute indices (0,1) corresponding
1494  * to the gathered quadword get shifted right by two lane positions.
1495  *
1496  */
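     // Illustrative example: for a 256-bit byte gather (vector_len == 32) the loop
     // below runs four times, gathering 8 bytes per iteration; the permuted slice
     // lands in quadword lanes 0..3 of the destination in successive iterations,
     // while idx_base advances by 32 bytes (8 int indices) and length drops from
     // 32 to 0 in steps of 8.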
1497 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1498                                         Register base, Register idx_base,
1499                                         Register mask, XMMRegister xtmp1,
1500                                         XMMRegister xtmp2, XMMRegister temp_dst,
1501                                         Register rtmp, Register mask_idx,
1502                                         Register length, int vector_len, int vlen_enc) {
1503   Label GATHER8_LOOP;
1504   assert(is_subword_type(elem_ty), "");
1505   movl(length, vector_len);
1506   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1507   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1508   vallones(xtmp2, vlen_enc);
1509   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1510   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1511   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1512 
1513   bind(GATHER8_LOOP);
1514     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1515     if (mask == noreg) {
1516       vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
1517     } else {
1518       vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
1519     }
1520     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1521     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1522     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1523     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1524     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1525     vpor(dst, dst, temp_dst, vlen_enc);
1526     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
1527     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1528     jcc(Assembler::notEqual, GATHER8_LOOP);
1529 }
1530 
1531 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1532   switch(typ) {
1533     case T_INT:
1534       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1535       break;
1536     case T_FLOAT:
1537       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1538       break;
1539     case T_LONG:
1540       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1541       break;
1542     case T_DOUBLE:
1543       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1544       break;
1545     default:
1546       assert(false,"Should not reach here.");
1547       break;
1548   }
1549 }
1550 
1551 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1552   switch(typ) {
1553     case T_INT:
1554       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1555       break;
1556     case T_FLOAT:
1557       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1558       break;
1559     case T_LONG:
1560       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1561       break;
1562     case T_DOUBLE:
1563       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1564       break;
1565     default:
1566       assert(false,"Should not reach here.");
1567       break;
1568   }
1569 }
1570 
1571 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1572   switch(typ) {
1573     case T_INT:
1574       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1575       break;
1576     case T_FLOAT:
1577       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1578       break;
1579     case T_LONG:
1580       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1581       break;
1582     case T_DOUBLE:
1583       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1584       break;
1585     default:
1586       assert(false,"Should not reach here.");
1587       break;
1588   }
1589 }
1590 
1591 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
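       // src is expected to hold one 0/1 byte per mask element (a boolean vector);
       // subtracting it from zero turns each 1 into 0xFF, and the sign extension
       // below widens that to an all-ones element of the requested size.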
1592   if (vlen_in_bytes <= 16) {
1593     pxor (dst, dst);
1594     psubb(dst, src);
1595     switch (elem_bt) {
1596       case T_BYTE:   /* nothing to do */ break;
1597       case T_SHORT:  pmovsxbw(dst, dst); break;
1598       case T_INT:    pmovsxbd(dst, dst); break;
1599       case T_FLOAT:  pmovsxbd(dst, dst); break;
1600       case T_LONG:   pmovsxbq(dst, dst); break;
1601       case T_DOUBLE: pmovsxbq(dst, dst); break;
1602 
1603       default: assert(false, "%s", type2name(elem_bt));
1604     }
1605   } else {
1606     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1607     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1608 
1609     vpxor (dst, dst, dst, vlen_enc);
1610     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1611 
1612     switch (elem_bt) {
1613       case T_BYTE:   /* nothing to do */            break;
1614       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1615       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1616       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1617       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1618       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1619 
1620       default: assert(false, "%s", type2name(elem_bt));
1621     }
1622   }
1623 }
1624 
1625 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1626   if (novlbwdq) {
1627     vpmovsxbd(xtmp, src, vlen_enc);
1628     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1629             Assembler::eq, true, vlen_enc, noreg);
1630   } else {
1631     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1632     vpsubb(xtmp, xtmp, src, vlen_enc);
1633     evpmovb2m(dst, xtmp, vlen_enc);
1634   }
1635 }
1636 
1637 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1638   if (is_integral_type(bt)) {
1639     switch (vlen_in_bytes) {
1640       case 4:  movdl(dst, src);   break;
1641       case 8:  movq(dst, src);    break;
1642       case 16: movdqu(dst, src);  break;
1643       case 32: vmovdqu(dst, src); break;
1644       case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1645       default: ShouldNotReachHere();
1646     }
1647   } else {
1648     switch (vlen_in_bytes) {
1649       case 4:  movflt(dst, src); break;
1650       case 8:  movdbl(dst, src); break;
1651       case 16: movups(dst, src); break;
1652       case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1653       case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1654       default: ShouldNotReachHere();
1655     }
1656   }
1657 }
1658 
1659 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1660   assert(rscratch != noreg || always_reachable(src), "missing");
1661 
1662   if (reachable(src)) {
1663     load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1664   } else {
1665     lea(rscratch, src);
1666     load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1667   }
1668 }
1669 
1670 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1671   int vlen_enc = vector_length_encoding(vlen);
1672   if (VM_Version::supports_avx()) {
1673     if (bt == T_LONG) {
1674       if (VM_Version::supports_avx2()) {
1675         vpbroadcastq(dst, src, vlen_enc);
1676       } else {
1677         vmovddup(dst, src, vlen_enc);
1678       }
1679     } else if (bt == T_DOUBLE) {
1680       if (vlen_enc != Assembler::AVX_128bit) {
1681         vbroadcastsd(dst, src, vlen_enc, noreg);
1682       } else {
1683         vmovddup(dst, src, vlen_enc);
1684       }
1685     } else {
1686       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1687         vpbroadcastd(dst, src, vlen_enc);
1688       } else {
1689         vbroadcastss(dst, src, vlen_enc);
1690       }
1691     }
1692   } else if (VM_Version::supports_sse3()) {
1693     movddup(dst, src);
1694   } else {
1695     load_vector(bt, dst, src, vlen);
1696   }
1697 }
1698 
1699 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1700   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
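       // For example, T_INT (4-byte elements) starts at exact_log2(4) << 6 == 128,
       // and T_FLOAT shares that element size but gets the extra 128-byte
       // floating-point displacement, so it starts at 256.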
1701   int offset = exact_log2(type2aelembytes(bt)) << 6;
1702   if (is_floating_point_type(bt)) {
1703     offset += 128;
1704   }
1705   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1706   load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1707 }
1708 
1709 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1710 
1711 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1712   int vector_len = Assembler::AVX_128bit;
1713 
1714   switch (opcode) {
1715     case Op_AndReductionV:  pand(dst, src); break;
1716     case Op_OrReductionV:   por (dst, src); break;
1717     case Op_XorReductionV:  pxor(dst, src); break;
1718     case Op_MinReductionV:
1719       switch (typ) {
1720         case T_BYTE:        pminsb(dst, src); break;
1721         case T_SHORT:       pminsw(dst, src); break;
1722         case T_INT:         pminsd(dst, src); break;
1723         case T_LONG:        assert(UseAVX > 2, "required");
1724                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1725         default:            assert(false, "wrong type");
1726       }
1727       break;
1728     case Op_MaxReductionV:
1729       switch (typ) {
1730         case T_BYTE:        pmaxsb(dst, src); break;
1731         case T_SHORT:       pmaxsw(dst, src); break;
1732         case T_INT:         pmaxsd(dst, src); break;
1733         case T_LONG:        assert(UseAVX > 2, "required");
1734                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1735         default:            assert(false, "wrong type");
1736       }
1737       break;
1738     case Op_AddReductionVF: addss(dst, src); break;
1739     case Op_AddReductionVD: addsd(dst, src); break;
1740     case Op_AddReductionVI:
1741       switch (typ) {
1742         case T_BYTE:        paddb(dst, src); break;
1743         case T_SHORT:       paddw(dst, src); break;
1744         case T_INT:         paddd(dst, src); break;
1745         default:            assert(false, "wrong type");
1746       }
1747       break;
1748     case Op_AddReductionVL: paddq(dst, src); break;
1749     case Op_MulReductionVF: mulss(dst, src); break;
1750     case Op_MulReductionVD: mulsd(dst, src); break;
1751     case Op_MulReductionVI:
1752       switch (typ) {
1753         case T_SHORT:       pmullw(dst, src); break;
1754         case T_INT:         pmulld(dst, src); break;
1755         default:            assert(false, "wrong type");
1756       }
1757       break;
1758     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1759                             evpmullq(dst, dst, src, vector_len); break;
1760     default:                assert(false, "wrong opcode");
1761   }
1762 }
1763 
1764 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1765   switch (opcode) {
1766     case Op_AddReductionVF: addps(dst, src); break;
1767     case Op_AddReductionVD: addpd(dst, src); break;
1768     case Op_MulReductionVF: mulps(dst, src); break;
1769     case Op_MulReductionVD: mulpd(dst, src); break;
1770     default:                assert(false, "%s", NodeClassNames[opcode]);
1771   }
1772 }
1773 
1774 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1775   int vector_len = Assembler::AVX_256bit;
1776 
1777   switch (opcode) {
1778     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1779     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1780     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1781     case Op_MinReductionV:
1782       switch (typ) {
1783         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1784         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1785         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1786         case T_LONG:        assert(UseAVX > 2, "required");
1787                             vpminsq(dst, src1, src2, vector_len); break;
1788         default:            assert(false, "wrong type");
1789       }
1790       break;
1791     case Op_MaxReductionV:
1792       switch (typ) {
1793         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1794         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1795         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1796         case T_LONG:        assert(UseAVX > 2, "required");
1797                             vpmaxsq(dst, src1, src2, vector_len); break;
1798         default:            assert(false, "wrong type");
1799       }
1800       break;
1801     case Op_AddReductionVI:
1802       switch (typ) {
1803         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1804         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1805         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1806         default:            assert(false, "wrong type");
1807       }
1808       break;
1809     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1810     case Op_MulReductionVI:
1811       switch (typ) {
1812         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1813         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1814         default:            assert(false, "wrong type");
1815       }
1816       break;
1817     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1818     default:                assert(false, "wrong opcode");
1819   }
1820 }
1821 
1822 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1823   int vector_len = Assembler::AVX_256bit;
1824 
1825   switch (opcode) {
1826     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1827     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1828     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1829     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1830     default:                assert(false, "%s", NodeClassNames[opcode]);
1831   }
1832 }
1833 
1834 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1835                                   XMMRegister dst, XMMRegister src,
1836                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1837   switch (opcode) {
1838     case Op_AddReductionVF:
1839     case Op_MulReductionVF:
1840       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1841       break;
1842 
1843     case Op_AddReductionVD:
1844     case Op_MulReductionVD:
1845       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1846       break;
1847 
1848     default: assert(false, "wrong opcode");
1849   }
1850 }
1851 
1852 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1853                                             XMMRegister dst, XMMRegister src,
1854                                             XMMRegister vtmp1, XMMRegister vtmp2) {
1855   switch (opcode) {
1856     case Op_AddReductionVF:
1857     case Op_MulReductionVF:
1858       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1859       break;
1860 
1861     case Op_AddReductionVD:
1862     case Op_MulReductionVD:
1863       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1864       break;
1865 
1866     default: assert(false, "%s", NodeClassNames[opcode]);
1867   }
1868 }
1869 
1870 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1871                              Register dst, Register src1, XMMRegister src2,
1872                              XMMRegister vtmp1, XMMRegister vtmp2) {
1873   switch (vlen) {
1874     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1875     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1876     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1877     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1878 
1879     default: assert(false, "wrong vector length");
1880   }
1881 }
1882 
1883 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1884                              Register dst, Register src1, XMMRegister src2,
1885                              XMMRegister vtmp1, XMMRegister vtmp2) {
1886   switch (vlen) {
1887     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1888     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1889     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1890     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1891 
1892     default: assert(false, "wrong vector length");
1893   }
1894 }
1895 
1896 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1897                              Register dst, Register src1, XMMRegister src2,
1898                              XMMRegister vtmp1, XMMRegister vtmp2) {
1899   switch (vlen) {
1900     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1901     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1902     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1903     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1904 
1905     default: assert(false, "wrong vector length");
1906   }
1907 }
1908 
1909 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1910                              Register dst, Register src1, XMMRegister src2,
1911                              XMMRegister vtmp1, XMMRegister vtmp2) {
1912   switch (vlen) {
1913     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1914     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1915     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1916     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1917 
1918     default: assert(false, "wrong vector length");
1919   }
1920 }
1921 
1922 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1923                              Register dst, Register src1, XMMRegister src2,
1924                              XMMRegister vtmp1, XMMRegister vtmp2) {
1925   switch (vlen) {
1926     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1927     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1928     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1929 
1930     default: assert(false, "wrong vector length");
1931   }
1932 }
1933 
1934 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1935   switch (vlen) {
1936     case 2:
1937       assert(vtmp2 == xnoreg, "");
1938       reduce2F(opcode, dst, src, vtmp1);
1939       break;
1940     case 4:
1941       assert(vtmp2 == xnoreg, "");
1942       reduce4F(opcode, dst, src, vtmp1);
1943       break;
1944     case 8:
1945       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1946       break;
1947     case 16:
1948       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1949       break;
1950     default: assert(false, "wrong vector length");
1951   }
1952 }
1953 
1954 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1955   switch (vlen) {
1956     case 2:
1957       assert(vtmp2 == xnoreg, "");
1958       reduce2D(opcode, dst, src, vtmp1);
1959       break;
1960     case 4:
1961       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1962       break;
1963     case 8:
1964       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1965       break;
1966     default: assert(false, "wrong vector length");
1967   }
1968 }
1969 
1970 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1971   switch (vlen) {
1972     case 2:
1973       assert(vtmp1 == xnoreg, "");
1974       assert(vtmp2 == xnoreg, "");
1975       unorderedReduce2F(opcode, dst, src);
1976       break;
1977     case 4:
1978       assert(vtmp2 == xnoreg, "");
1979       unorderedReduce4F(opcode, dst, src, vtmp1);
1980       break;
1981     case 8:
1982       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
1983       break;
1984     case 16:
1985       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
1986       break;
1987     default: assert(false, "wrong vector length");
1988   }
1989 }
1990 
1991 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1992   switch (vlen) {
1993     case 2:
1994       assert(vtmp1 == xnoreg, "");
1995       assert(vtmp2 == xnoreg, "");
1996       unorderedReduce2D(opcode, dst, src);
1997       break;
1998     case 4:
1999       assert(vtmp2 == xnoreg, "");
2000       unorderedReduce4D(opcode, dst, src, vtmp1);
2001       break;
2002     case 8:
2003       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2004       break;
2005     default: assert(false, "wrong vector length");
2006   }
2007 }
2008 
2009 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
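       // For the add reduction, phaddd folds the two int lanes into lane 0 with one
       // horizontal add; for the other opcodes, lane 1 is brought down with pshufd
       // and combined explicitly. The scalar input src1 is then folded in and the
       // low lane extracted.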
2010   if (opcode == Op_AddReductionVI) {
2011     if (vtmp1 != src2) {
2012       movdqu(vtmp1, src2);
2013     }
2014     phaddd(vtmp1, vtmp1);
2015   } else {
2016     pshufd(vtmp1, src2, 0x1);
2017     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2018   }
2019   movdl(vtmp2, src1);
2020   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2021   movdl(dst, vtmp1);
2022 }
2023 
2024 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2025   if (opcode == Op_AddReductionVI) {
2026     if (vtmp1 != src2) {
2027       movdqu(vtmp1, src2);
2028     }
2029     phaddd(vtmp1, src2);
2030     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2031   } else {
2032     pshufd(vtmp2, src2, 0xE);
2033     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2034     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2035   }
2036 }
2037 
2038 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2039   if (opcode == Op_AddReductionVI) {
2040     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2041     vextracti128_high(vtmp2, vtmp1);
2042     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2043     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2044   } else {
2045     vextracti128_high(vtmp1, src2);
2046     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2047     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2048   }
2049 }
2050 
2051 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2052   vextracti64x4_high(vtmp2, src2);
2053   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2054   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2055 }
2056 
2057 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2058   pshufd(vtmp2, src2, 0x1);
2059   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2060   movdqu(vtmp1, vtmp2);
2061   psrldq(vtmp1, 2);
2062   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2063   movdqu(vtmp2, vtmp1);
2064   psrldq(vtmp2, 1);
2065   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2066   movdl(vtmp2, src1);
2067   pmovsxbd(vtmp1, vtmp1);
2068   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2069   pextrb(dst, vtmp1, 0x0);
2070   movsbl(dst, dst);
2071 }
2072 
2073 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2074   pshufd(vtmp1, src2, 0xE);
2075   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2076   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2077 }
2078 
2079 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2080   vextracti128_high(vtmp2, src2);
2081   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2082   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2083 }
2084 
2085 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2086   vextracti64x4_high(vtmp1, src2);
2087   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2088   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2089 }
2090 
2091 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2092   pmovsxbw(vtmp2, src2);
2093   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2094 }
2095 
2096 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2097   if (UseAVX > 1) {
2098     int vector_len = Assembler::AVX_256bit;
2099     vpmovsxbw(vtmp1, src2, vector_len);
2100     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2101   } else {
2102     pmovsxbw(vtmp2, src2);
2103     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2104     pshufd(vtmp2, src2, 0x1);
2105     pmovsxbw(vtmp2, src2);
2106     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2107   }
2108 }
2109 
2110 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2111   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2112     int vector_len = Assembler::AVX_512bit;
2113     vpmovsxbw(vtmp1, src2, vector_len);
2114     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2115   } else {
2116     assert(UseAVX >= 2,"Should not reach here.");
2117     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2118     vextracti128_high(vtmp2, src2);
2119     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2120   }
2121 }
2122 
2123 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2124   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2125   vextracti64x4_high(vtmp2, src2);
2126   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2127 }
2128 
2129 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2130   if (opcode == Op_AddReductionVI) {
2131     if (vtmp1 != src2) {
2132       movdqu(vtmp1, src2);
2133     }
2134     phaddw(vtmp1, vtmp1);
2135     phaddw(vtmp1, vtmp1);
2136   } else {
2137     pshufd(vtmp2, src2, 0x1);
2138     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2139     movdqu(vtmp1, vtmp2);
2140     psrldq(vtmp1, 2);
2141     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2142   }
2143   movdl(vtmp2, src1);
2144   pmovsxwd(vtmp1, vtmp1);
2145   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2146   pextrw(dst, vtmp1, 0x0);
2147   movswl(dst, dst);
2148 }
2149 
2150 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2151   if (opcode == Op_AddReductionVI) {
2152     if (vtmp1 != src2) {
2153       movdqu(vtmp1, src2);
2154     }
2155     phaddw(vtmp1, src2);
2156   } else {
2157     pshufd(vtmp1, src2, 0xE);
2158     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2159   }
2160   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2161 }
2162 
2163 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2164   if (opcode == Op_AddReductionVI) {
2165     int vector_len = Assembler::AVX_256bit;
2166     vphaddw(vtmp2, src2, src2, vector_len);
2167     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2168   } else {
2169     vextracti128_high(vtmp2, src2);
2170     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2171   }
2172   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2173 }
2174 
2175 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2176   int vector_len = Assembler::AVX_256bit;
2177   vextracti64x4_high(vtmp1, src2);
2178   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2179   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2180 }
2181 
2182 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2183   pshufd(vtmp2, src2, 0xE);
2184   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2185   movdq(vtmp1, src1);
2186   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2187   movdq(dst, vtmp1);
2188 }
2189 
2190 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2191   vextracti128_high(vtmp1, src2);
2192   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2193   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2194 }
2195 
2196 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2197   vextracti64x4_high(vtmp2, src2);
2198   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2199   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2200 }
2201 
2202 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
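       // Builds a mask with the lowest 'len' bits set: bzhiq zeroes all bits of -1
       // at positions >= len (e.g. len == 5 yields 0x1f).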
2203   mov64(temp, -1L);
2204   bzhiq(temp, temp, len);
2205   kmovql(dst, temp);
2206 }
2207 
2208 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2209   reduce_operation_128(T_FLOAT, opcode, dst, src);
2210   pshufd(vtmp, src, 0x1);
2211   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2212 }
2213 
2214 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2215   reduce2F(opcode, dst, src, vtmp);
2216   pshufd(vtmp, src, 0x2);
2217   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2218   pshufd(vtmp, src, 0x3);
2219   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2220 }
2221 
2222 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2223   reduce4F(opcode, dst, src, vtmp2);
2224   vextractf128_high(vtmp2, src);
2225   reduce4F(opcode, dst, vtmp2, vtmp1);
2226 }
2227 
2228 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2229   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2230   vextracti64x4_high(vtmp1, src);
2231   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2232 }
2233 
2234 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2235   pshufd(dst, src, 0x1);
2236   reduce_operation_128(T_FLOAT, opcode, dst, src);
2237 }
2238 
2239 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2240   pshufd(vtmp, src, 0xE);
2241   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2242   unorderedReduce2F(opcode, dst, vtmp);
2243 }
2244 
2245 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2246   vextractf128_high(vtmp1, src);
2247   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2248   unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2249 }
2250 
2251 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2252   vextractf64x4_high(vtmp2, src);
2253   unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2254   unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2255 }
2256 
2257 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2258   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2259   pshufd(vtmp, src, 0xE);
2260   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2261 }
2262 
2263 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2264   reduce2D(opcode, dst, src, vtmp2);
2265   vextractf128_high(vtmp2, src);
2266   reduce2D(opcode, dst, vtmp2, vtmp1);
2267 }
2268 
2269 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2270   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2271   vextracti64x4_high(vtmp1, src);
2272   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2273 }
2274 
2275 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2276   pshufd(dst, src, 0xE);
2277   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2278 }
2279 
2280 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2281   vextractf128_high(vtmp, src);
2282   unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2283   unorderedReduce2D(opcode, dst, vtmp);
2284 }
2285 
2286 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2287   vextractf64x4_high(vtmp2, src);
2288   unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2289   unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2290 }
2291 
2292 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2293   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2294 }
2295 
2296 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2297   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2298 }
2299 
2300 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
2301   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2302 }
2303 
2304 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2305                                  int vec_enc) {
2306   switch(elem_bt) {
2307     case T_INT:
2308     case T_FLOAT:
2309       vmaskmovps(dst, src, mask, vec_enc);
2310       break;
2311     case T_LONG:
2312     case T_DOUBLE:
2313       vmaskmovpd(dst, src, mask, vec_enc);
2314       break;
2315     default:
2316       fatal("Unsupported type %s", type2name(elem_bt));
2317       break;
2318   }
2319 }
2320 
2321 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2322                                  int vec_enc) {
2323   switch(elem_bt) {
2324     case T_INT:
2325     case T_FLOAT:
2326       vmaskmovps(dst, src, mask, vec_enc);
2327       break;
2328     case T_LONG:
2329     case T_DOUBLE:
2330       vmaskmovpd(dst, src, mask, vec_enc);
2331       break;
2332     default:
2333       fatal("Unsupported type %s", type2name(elem_bt));
2334       break;
2335   }
2336 }
2337 
2338 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2339                                           XMMRegister dst, XMMRegister src,
2340                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2341                                           XMMRegister xmm_0, XMMRegister xmm_1) {
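       // Fold the vector in halves log2(vlen) times: the upper 256/128 bits are
       // brought down with vextract*, and within a 128-bit lane vpermilps with
       // permconst[i] supplies the other operand (14 folds elements 2..3 onto 0..1,
       // 1 folds element 1 onto 0).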
2342   const int permconst[] = {1, 14};
2343   XMMRegister wsrc = src;
2344   XMMRegister wdst = xmm_0;
2345   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2346 
2347   int vlen_enc = Assembler::AVX_128bit;
2348   if (vlen == 16) {
2349     vlen_enc = Assembler::AVX_256bit;
2350   }
2351 
2352   for (int i = log2(vlen) - 1; i >=0; i--) {
2353     if (i == 0 && !is_dst_valid) {
2354       wdst = dst;
2355     }
2356     if (i == 3) {
2357       vextracti64x4_high(wtmp, wsrc);
2358     } else if (i == 2) {
2359       vextracti128_high(wtmp, wsrc);
2360     } else { // i = [0,1]
2361       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2362     }
2363 
2364     if (VM_Version::supports_avx10_2()) {
2365       vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
2366     } else {
2367       vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2368     }
2369     wsrc = wdst;
2370     vlen_enc = Assembler::AVX_128bit;
2371   }
2372   if (is_dst_valid) {
2373     if (VM_Version::supports_avx10_2()) {
2374       vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
2375     } else {
2376       vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2377     }
2378   }
2379 }
2380 
2381 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2382                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2383                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2384   XMMRegister wsrc = src;
2385   XMMRegister wdst = xmm_0;
2386   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2387   int vlen_enc = Assembler::AVX_128bit;
2388   if (vlen == 8) {
2389     vlen_enc = Assembler::AVX_256bit;
2390   }
2391   for (int i = log2(vlen) - 1; i >=0; i--) {
2392     if (i == 0 && !is_dst_valid) {
2393       wdst = dst;
2394     }
2395     if (i == 1) {
2396       vextracti128_high(wtmp, wsrc);
2397     } else if (i == 2) {
2398       vextracti64x4_high(wtmp, wsrc);
2399     } else {
2400       assert(i == 0, "%d", i);
2401       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2402     }
2403 
2404     if (VM_Version::supports_avx10_2()) {
2405       vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
2406     } else {
2407       vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2408     }
2409 
2410     wsrc = wdst;
2411     vlen_enc = Assembler::AVX_128bit;
2412   }
2413 
2414   if (is_dst_valid) {
2415     if (VM_Version::supports_avx10_2()) {
2416       vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
2417     } else {
2418       vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2419     }
2420   }
2421 }
2422 
2423 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2424   switch (bt) {
2425     case T_BYTE:  pextrb(dst, src, idx); break;
2426     case T_SHORT: pextrw(dst, src, idx); break;
2427     case T_INT:   pextrd(dst, src, idx); break;
2428     case T_LONG:  pextrq(dst, src, idx); break;
2429 
2430     default:
2431       assert(false,"Should not reach here.");
2432       break;
2433   }
2434 }
2435 
2436 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2437   int esize =  type2aelembytes(typ);
2438   int elem_per_lane = 16/esize;
2439   int lane = elemindex / elem_per_lane;
2440   int eindex = elemindex % elem_per_lane;
2441 
2442   if (lane >= 2) {
2443     assert(UseAVX > 2, "required");
2444     vextractf32x4(dst, src, lane & 3);
2445     return dst;
2446   } else if (lane > 0) {
2447     assert(UseAVX > 0, "required");
2448     vextractf128(dst, src, lane);
2449     return dst;
2450   } else {
2451     return src;
2452   }
2453 }
2454 
2455 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2456   if (typ == T_BYTE) {
2457     movsbl(dst, dst);
2458   } else if (typ == T_SHORT) {
2459     movswl(dst, dst);
2460   }
2461 }
2462 
2463 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2464   int esize =  type2aelembytes(typ);
2465   int elem_per_lane = 16/esize;
2466   int eindex = elemindex % elem_per_lane;
2467   assert(is_integral_type(typ),"required");
2468 
2469   if (eindex == 0) {
2470     if (typ == T_LONG) {
2471       movq(dst, src);
2472     } else {
2473       movdl(dst, src);
2474       movsxl(typ, dst);
2475     }
2476   } else {
2477     extract(typ, dst, src, eindex);
2478     movsxl(typ, dst);
2479   }
2480 }
2481 
2482 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2483   int esize =  type2aelembytes(typ);
2484   int elem_per_lane = 16/esize;
2485   int eindex = elemindex % elem_per_lane;
2486   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2487 
2488   if (eindex == 0) {
2489     movq(dst, src);
2490   } else {
2491     if (typ == T_FLOAT) {
2492       if (UseAVX == 0) {
2493         movdqu(dst, src);
2494         shufps(dst, dst, eindex);
2495       } else {
2496         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2497       }
2498     } else {
2499       if (UseAVX == 0) {
2500         movdqu(dst, src);
2501         psrldq(dst, eindex*esize);
2502       } else {
2503         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2504       }
2505       movq(dst, dst);
2506     }
2507   }
2508   // Zero upper bits
2509   if (typ == T_FLOAT) {
2510     if (UseAVX == 0) {
2511       assert(vtmp != xnoreg, "required.");
2512       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2513       pand(dst, vtmp);
2514     } else {
2515       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2516     }
2517   }
2518 }
2519 
2520 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2521   switch(typ) {
2522     case T_BYTE:
2523     case T_BOOLEAN:
2524       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2525       break;
2526     case T_SHORT:
2527     case T_CHAR:
2528       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2529       break;
2530     case T_INT:
2531     case T_FLOAT:
2532       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2533       break;
2534     case T_LONG:
2535     case T_DOUBLE:
2536       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2537       break;
2538     default:
2539       assert(false,"Should not reach here.");
2540       break;
2541   }
2542 }
2543 
2544 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2545   assert(rscratch != noreg || always_reachable(src2), "missing");
2546 
2547   switch(typ) {
2548     case T_BOOLEAN:
2549     case T_BYTE:
2550       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2551       break;
2552     case T_CHAR:
2553     case T_SHORT:
2554       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2555       break;
2556     case T_INT:
2557     case T_FLOAT:
2558       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2559       break;
2560     case T_LONG:
2561     case T_DOUBLE:
2562       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2563       break;
2564     default:
2565       assert(false,"Should not reach here.");
2566       break;
2567   }
2568 }
2569 
2570 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2571   switch(typ) {
2572     case T_BYTE:
2573       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2574       break;
2575     case T_SHORT:
2576       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2577       break;
2578     case T_INT:
2579     case T_FLOAT:
2580       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2581       break;
2582     case T_LONG:
2583     case T_DOUBLE:
2584       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2585       break;
2586     default:
2587       assert(false,"Should not reach here.");
2588       break;
2589   }
2590 }
2591 
2592 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2593   assert(vlen_in_bytes <= 32, "");
2594   int esize = type2aelembytes(bt);
2595   if (vlen_in_bytes == 32) {
2596     assert(vtmp == xnoreg, "required.");
2597     if (esize >= 4) {
2598       vtestps(src1, src2, AVX_256bit);
2599     } else {
2600       vptest(src1, src2, AVX_256bit);
2601     }
2602     return;
2603   }
2604   if (vlen_in_bytes < 16) {
2605     // Duplicate the lower part to fill the whole register;
2606     // there is no need to do so for src2.
2607     assert(vtmp != xnoreg, "required");
2608     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2609     pshufd(vtmp, src1, shuffle_imm);
2610   } else {
2611     assert(vtmp == xnoreg, "required");
2612     vtmp = src1;
2613   }
2614   if (esize >= 4 && VM_Version::supports_avx()) {
2615     vtestps(vtmp, src2, AVX_128bit);
2616   } else {
2617     ptest(vtmp, src2);
2618   }
2619 }
2620 
2621 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2622 #ifdef ASSERT
2623   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2624   bool is_bw_supported = VM_Version::supports_avx512bw();
2625   if (is_bw && !is_bw_supported) {
2626     assert(vlen_enc != Assembler::AVX_512bit, "required");
2627     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2628            "XMM register should be 0-15");
2629   }
2630 #endif // ASSERT
2631   switch (elem_bt) {
2632     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2633     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2634     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2635     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2636     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2637     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2638     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2639   }
2640 }
2641 
2642 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2643   assert(UseAVX >= 2, "required");
2644   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2645   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2646   if ((UseAVX > 2) &&
2647       (!is_bw || VM_Version::supports_avx512bw()) &&
2648       (!is_vl || VM_Version::supports_avx512vl())) {
2649     switch (elem_bt) {
2650       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2651       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2652       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2653       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2654       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2655     }
2656   } else {
2657     assert(vlen_enc != Assembler::AVX_512bit, "required");
2658     assert((dst->encoding() < 16),"XMM register should be 0-15");
2659     switch (elem_bt) {
2660       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2661       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2662       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2663       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2664       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2665       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2666       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2667     }
2668   }
2669 }
2670 
2671 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2672   switch (to_elem_bt) {
2673     case T_SHORT:
2674       vpmovsxbw(dst, src, vlen_enc);
2675       break;
2676     case T_INT:
2677       vpmovsxbd(dst, src, vlen_enc);
2678       break;
2679     case T_FLOAT:
2680       vpmovsxbd(dst, src, vlen_enc);
2681       vcvtdq2ps(dst, dst, vlen_enc);
2682       break;
2683     case T_LONG:
2684       vpmovsxbq(dst, src, vlen_enc);
2685       break;
2686     case T_DOUBLE: {
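           // vcvtdq2pd doubles the element width, so the intermediate int vector
           // only needs half the destination width.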
2687       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2688       vpmovsxbd(dst, src, mid_vlen_enc);
2689       vcvtdq2pd(dst, dst, vlen_enc);
2690       break;
2691     }
2692     default:
2693       fatal("Unsupported type %s", type2name(to_elem_bt));
2694       break;
2695   }
2696 }
2697 
2698 //-------------------------------------------------------------------------------------------
2699 
2700 // IndexOf for constant substrings with size >= 8 chars
2701 // which don't need to be loaded through stack.
2702 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2703                                          Register cnt1, Register cnt2,
2704                                          int int_cnt2,  Register result,
2705                                          XMMRegister vec, Register tmp,
2706                                          int ae) {
2707   ShortBranchVerifier sbv(this);
2708   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2709   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2710 
2711   // This method uses the pcmpestri instruction with bound registers
2712   //   inputs:
2713   //     xmm - substring
2714   //     rax - substring length (elements count)
2715   //     mem - scanned string
2716   //     rdx - string length (elements count)
2717   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2718   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2719   //   outputs:
2720   //     rcx - matched index in string
2721   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2722   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2723   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2724   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2725   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2726 
2727   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2728         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2729         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2730 
2731   // Note, inline_string_indexOf() generates checks:
2732   // if (substr.count > string.count) return -1;
2733   // if (substr.count == 0) return 0;
2734   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2735 
2736   // Load substring.
2737   if (ae == StrIntrinsicNode::UL) {
2738     pmovzxbw(vec, Address(str2, 0));
2739   } else {
2740     movdqu(vec, Address(str2, 0));
2741   }
2742   movl(cnt2, int_cnt2);
2743   movptr(result, str1); // string addr
2744 
2745   if (int_cnt2 > stride) {
2746     jmpb(SCAN_TO_SUBSTR);
2747 
2748     // Reload substr for rescan; this code
2749     // is executed only for large substrings (> 8 chars).
2750     bind(RELOAD_SUBSTR);
2751     if (ae == StrIntrinsicNode::UL) {
2752       pmovzxbw(vec, Address(str2, 0));
2753     } else {
2754       movdqu(vec, Address(str2, 0));
2755     }
2756     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2757 
2758     bind(RELOAD_STR);
2759     // We came here after the beginning of the substring was
2760     // matched but the rest of it was not, so we need to search
2761     // again. Start from the next element after the previous match.
2762 
2763     // cnt2 is the number of remaining substring elements and
2764     // cnt1 is the number of remaining string elements when the cmp failed.
2765     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2766     subl(cnt1, cnt2);
2767     addl(cnt1, int_cnt2);
2768     movl(cnt2, int_cnt2); // Now restore cnt2
2769 
2770     decrementl(cnt1);     // Shift to next element
2771     cmpl(cnt1, cnt2);
2772     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2773 
2774     addptr(result, (1<<scale1));
2775 
2776   } // (int_cnt2 > 8)
2777 
2778   // Scan string for start of substr in 16-byte vectors
2779   bind(SCAN_TO_SUBSTR);
2780   pcmpestri(vec, Address(result, 0), mode);
2781   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2782   subl(cnt1, stride);
2783   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2784   cmpl(cnt1, cnt2);
2785   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2786   addptr(result, 16);
2787   jmpb(SCAN_TO_SUBSTR);
2788 
2789   // Found a potential substr
2790   bind(FOUND_CANDIDATE);
2791   // Matched whole vector if first element matched (tmp(rcx) == 0).
2792   if (int_cnt2 == stride) {
2793     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2794   } else { // int_cnt2 > 8
2795     jccb(Assembler::overflow, FOUND_SUBSTR);
2796   }
2797   // After pcmpestri tmp(rcx) contains matched element index
2798   // Compute start addr of substr
2799   lea(result, Address(result, tmp, scale1));
2800 
2801   // Make sure string is still long enough
2802   subl(cnt1, tmp);
2803   cmpl(cnt1, cnt2);
2804   if (int_cnt2 == stride) {
2805     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2806   } else { // int_cnt2 > 8
2807     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2808   }
2809   // Left less than substring.
2810 
2811   bind(RET_NOT_FOUND);
2812   movl(result, -1);
2813   jmp(EXIT);
2814 
2815   if (int_cnt2 > stride) {
2816     // This code is optimized for the case where the whole substring
2817     // matches if its head matches.
2818     bind(MATCH_SUBSTR_HEAD);
2819     pcmpestri(vec, Address(result, 0), mode);
2820     // Reload only the string if it does not match
2821     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2822 
2823     Label CONT_SCAN_SUBSTR;
2824     // Compare the rest of substring (> 8 chars).
2825     bind(FOUND_SUBSTR);
2826     // First 8 chars are already matched.
2827     negptr(cnt2);
2828     addptr(cnt2, stride);
2829 
2830     bind(SCAN_SUBSTR);
2831     subl(cnt1, stride);
2832     cmpl(cnt2, -stride); // Do not read beyond substring
2833     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2834     // Back-up strings to avoid reading beyond substring:
2835     // cnt1 = cnt1 - cnt2 + 8
2836     addl(cnt1, cnt2); // cnt2 is negative
2837     addl(cnt1, stride);
2838     movl(cnt2, stride); negptr(cnt2);
2839     bind(CONT_SCAN_SUBSTR);
2840     if (int_cnt2 < (int)G) {
2841       int tail_off1 = int_cnt2<<scale1;
2842       int tail_off2 = int_cnt2<<scale2;
2843       if (ae == StrIntrinsicNode::UL) {
2844         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2845       } else {
2846         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2847       }
2848       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2849     } else {
2850       // calculate index in register to avoid integer overflow (int_cnt2*2)
2851       movl(tmp, int_cnt2);
2852       addptr(tmp, cnt2);
2853       if (ae == StrIntrinsicNode::UL) {
2854         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2855       } else {
2856         movdqu(vec, Address(str2, tmp, scale2, 0));
2857       }
2858       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2859     }
2860     // Need to reload string pointers if we did not match the whole vector
2861     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2862     addptr(cnt2, stride);
2863     jcc(Assembler::negative, SCAN_SUBSTR);
2864     // Fall through if found full substring
2865 
2866   } // (int_cnt2 > 8)
2867 
2868   bind(RET_FOUND);
2869   // Found result if we matched full small substring.
2870   // Compute substr offset
2871   subptr(result, str1);
2872   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2873     shrl(result, 1); // index
2874   }
2875   bind(EXIT);
2876 
2877 } // string_indexofC8
2878 
2879 // Small strings are loaded through the stack if they cross a page boundary.
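     // The copy-to-stack path below is only taken when a 16-byte read starting
     // at a small string could cross a page boundary. Illustrative sketch of the
     // safety check (names are not from the VM):
     //
     //   static boolean fits16BytesInPage(long addr, int pageSize) {
     //     // the low bits give the offset within the page; if it is at most
     //     // pageSize - 16, a 16-byte read at addr stays inside the page
     //     return (addr & (pageSize - 1)) <= pageSize - 16;
     //   }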
2880 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2881                                        Register cnt1, Register cnt2,
2882                                        int int_cnt2,  Register result,
2883                                        XMMRegister vec, Register tmp,
2884                                        int ae) {
2885   ShortBranchVerifier sbv(this);
2886   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2887   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2888 
2889   //
2890   // int_cnt2 is the length of a small (< 8 chars) constant substring,
2891   // or (-1) for a non-constant substring, in which case its length
2892   // is in the cnt2 register.
2893   //
2894   // Note, inline_string_indexOf() generates checks:
2895   // if (substr.count > string.count) return -1;
2896   // if (substr.count == 0) return 0;
2897   //
2898   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2899   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2900   // This method uses the pcmpestri instruction with bound registers
2901   //   inputs:
2902   //     xmm - substring
2903   //     rax - substring length (elements count)
2904   //     mem - scanned string
2905   //     rdx - string length (elements count)
2906   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2907   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2908   //   outputs:
2909   //     rcx - matched index in string
2910   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2911   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2912   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2913   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2914 
2915   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2916         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2917         FOUND_CANDIDATE;
2918 
2919   { //========================================================
2920     // We don't know where these strings are located
2921     // and we can't read beyond them. Load them through the stack.
2922     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2923 
2924     movptr(tmp, rsp); // save old SP
2925 
2926     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2927       if (int_cnt2 == (1>>scale2)) { // One byte
2928         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2929         load_unsigned_byte(result, Address(str2, 0));
2930         movdl(vec, result); // move 32 bits
2931       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2932         // Not enough header space in 32-bit VM: 12+3 = 15.
2933         movl(result, Address(str2, -1));
2934         shrl(result, 8);
2935         movdl(vec, result); // move 32 bits
2936       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2937         load_unsigned_short(result, Address(str2, 0));
2938         movdl(vec, result); // move 32 bits
2939       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2940         movdl(vec, Address(str2, 0)); // move 32 bits
2941       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2942         movq(vec, Address(str2, 0));  // move 64 bits
2943       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
2944         // Array header size is 12 bytes in 32-bit VM
2945         // + 6 bytes for 3 chars == 18 bytes,
2946         // enough space to load vec and shift.
2947         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2948         if (ae == StrIntrinsicNode::UL) {
2949           int tail_off = int_cnt2-8;
2950           pmovzxbw(vec, Address(str2, tail_off));
2951           psrldq(vec, -2*tail_off);
2952         }
2953         else {
2954           int tail_off = int_cnt2*(1<<scale2);
2955           movdqu(vec, Address(str2, tail_off-16));
2956           psrldq(vec, 16-tail_off);
2957         }
2958       }
2959     } else { // not constant substring
2960       cmpl(cnt2, stride);
2961       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2962 
2963       // We can read beyond the string if str+16 does not cross a page boundary
2964       // since heaps are aligned and mapped by pages.
2965       assert(os::vm_page_size() < (int)G, "default page should be small");
2966       movl(result, str2); // We need only low 32 bits
2967       andl(result, ((int)os::vm_page_size()-1));
2968       cmpl(result, ((int)os::vm_page_size()-16));
2969       jccb(Assembler::belowEqual, CHECK_STR);
2970 
2971       // Move small strings to the stack to allow loading 16 bytes into vec.
2972       subptr(rsp, 16);
2973       int stk_offset = wordSize-(1<<scale2);
2974       push(cnt2);
2975 
2976       bind(COPY_SUBSTR);
2977       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2978         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2979         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2980       } else if (ae == StrIntrinsicNode::UU) {
2981         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2982         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2983       }
2984       decrement(cnt2);
2985       jccb(Assembler::notZero, COPY_SUBSTR);
2986 
2987       pop(cnt2);
2988       movptr(str2, rsp);  // New substring address
2989     } // non constant
2990 
2991     bind(CHECK_STR);
2992     cmpl(cnt1, stride);
2993     jccb(Assembler::aboveEqual, BIG_STRINGS);
2994 
2995     // Check cross page boundary.
2996     movl(result, str1); // We need only low 32 bits
2997     andl(result, ((int)os::vm_page_size()-1));
2998     cmpl(result, ((int)os::vm_page_size()-16));
2999     jccb(Assembler::belowEqual, BIG_STRINGS);
3000 
3001     subptr(rsp, 16);
3002     int stk_offset = -(1<<scale1);
3003     if (int_cnt2 < 0) { // not constant
3004       push(cnt2);
3005       stk_offset += wordSize;
3006     }
3007     movl(cnt2, cnt1);
3008 
3009     bind(COPY_STR);
3010     if (ae == StrIntrinsicNode::LL) {
3011       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3012       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3013     } else {
3014       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3015       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3016     }
3017     decrement(cnt2);
3018     jccb(Assembler::notZero, COPY_STR);
3019 
3020     if (int_cnt2 < 0) { // not constant
3021       pop(cnt2);
3022     }
3023     movptr(str1, rsp);  // New string address
3024 
3025     bind(BIG_STRINGS);
3026     // Load substring.
3027     if (int_cnt2 < 0) { // -1
3028       if (ae == StrIntrinsicNode::UL) {
3029         pmovzxbw(vec, Address(str2, 0));
3030       } else {
3031         movdqu(vec, Address(str2, 0));
3032       }
3033       push(cnt2);       // substr count
3034       push(str2);       // substr addr
3035       push(str1);       // string addr
3036     } else {
3037       // Small (< 8 chars) constant substrings are loaded already.
3038       movl(cnt2, int_cnt2);
3039     }
3040     push(tmp);  // original SP
3041 
3042   } // Finished loading
3043 
3044   //========================================================
3045   // Start search
3046   //
3047 
3048   movptr(result, str1); // string addr
3049 
3050   if (int_cnt2  < 0) {  // Only for non constant substring
3051     jmpb(SCAN_TO_SUBSTR);
3052 
3053     // SP saved at sp+0
3054     // String saved at sp+1*wordSize
3055     // Substr saved at sp+2*wordSize
3056     // Substr count saved at sp+3*wordSize
3057 
3058     // Reload substr for rescan; this code
3059     // is executed only for large substrings (> 8 chars).
3060     bind(RELOAD_SUBSTR);
3061     movptr(str2, Address(rsp, 2*wordSize));
3062     movl(cnt2, Address(rsp, 3*wordSize));
3063     if (ae == StrIntrinsicNode::UL) {
3064       pmovzxbw(vec, Address(str2, 0));
3065     } else {
3066       movdqu(vec, Address(str2, 0));
3067     }
3068     // We came here after the beginning of the substring was
3069     // matched but the rest of it was not, so we need to search
3070     // again. Start from the next element after the previous match.
3071     subptr(str1, result); // Restore counter
3072     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3073       shrl(str1, 1);
3074     }
3075     addl(cnt1, str1);
3076     decrementl(cnt1);   // Shift to next element
3077     cmpl(cnt1, cnt2);
3078     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3079 
3080     addptr(result, (1<<scale1));
3081   } // non constant
3082 
3083   // Scan string for start of substr in 16-byte vectors
3084   bind(SCAN_TO_SUBSTR);
3085   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3086   pcmpestri(vec, Address(result, 0), mode);
3087   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3088   subl(cnt1, stride);
3089   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3090   cmpl(cnt1, cnt2);
3091   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3092   addptr(result, 16);
3093 
3094   bind(ADJUST_STR);
3095   cmpl(cnt1, stride); // Do not read beyond string
3096   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3097   // Back-up string to avoid reading beyond string.
3098   lea(result, Address(result, cnt1, scale1, -16));
3099   movl(cnt1, stride);
3100   jmpb(SCAN_TO_SUBSTR);
3101 
3102   // Found a potential substr
3103   bind(FOUND_CANDIDATE);
3104   // After pcmpestri tmp(rcx) contains matched element index
3105 
3106   // Make sure string is still long enough
3107   subl(cnt1, tmp);
3108   cmpl(cnt1, cnt2);
3109   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3110   // Left less than substring.
3111 
3112   bind(RET_NOT_FOUND);
3113   movl(result, -1);
3114   jmp(CLEANUP);
3115 
3116   bind(FOUND_SUBSTR);
3117   // Compute start addr of substr
3118   lea(result, Address(result, tmp, scale1));
3119   if (int_cnt2 > 0) { // Constant substring
3120     // Repeat search for small substring (< 8 chars)
3121     // from new point without reloading substring.
3122     // Have to check that we don't read beyond string.
3123     cmpl(tmp, stride-int_cnt2);
3124     jccb(Assembler::greater, ADJUST_STR);
3125     // Fall through if matched whole substring.
3126   } else { // non constant
3127     assert(int_cnt2 == -1, "should be != 0");
3128 
3129     addl(tmp, cnt2);
3130     // Found result if we matched whole substring.
3131     cmpl(tmp, stride);
3132     jcc(Assembler::lessEqual, RET_FOUND);
3133 
3134     // Repeat search for small substring (<= 8 chars)
3135     // from new point 'str1' without reloading substring.
3136     cmpl(cnt2, stride);
3137     // Have to check that we don't read beyond string.
3138     jccb(Assembler::lessEqual, ADJUST_STR);
3139 
3140     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3141     // Compare the rest of substring (> 8 chars).
3142     movptr(str1, result);
3143 
3144     cmpl(tmp, cnt2);
3145     // First 8 chars are already matched.
3146     jccb(Assembler::equal, CHECK_NEXT);
3147 
3148     bind(SCAN_SUBSTR);
3149     pcmpestri(vec, Address(str1, 0), mode);
3150     // Need to reload string pointers if we did not match the whole vector
3151     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3152 
3153     bind(CHECK_NEXT);
3154     subl(cnt2, stride);
3155     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3156     addptr(str1, 16);
3157     if (ae == StrIntrinsicNode::UL) {
3158       addptr(str2, 8);
3159     } else {
3160       addptr(str2, 16);
3161     }
3162     subl(cnt1, stride);
3163     cmpl(cnt2, stride); // Do not read beyond substring
3164     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3165     // Back-up strings to avoid reading beyond substring.
3166 
3167     if (ae == StrIntrinsicNode::UL) {
3168       lea(str2, Address(str2, cnt2, scale2, -8));
3169       lea(str1, Address(str1, cnt2, scale1, -16));
3170     } else {
3171       lea(str2, Address(str2, cnt2, scale2, -16));
3172       lea(str1, Address(str1, cnt2, scale1, -16));
3173     }
3174     subl(cnt1, cnt2);
3175     movl(cnt2, stride);
3176     addl(cnt1, stride);
3177     bind(CONT_SCAN_SUBSTR);
3178     if (ae == StrIntrinsicNode::UL) {
3179       pmovzxbw(vec, Address(str2, 0));
3180     } else {
3181       movdqu(vec, Address(str2, 0));
3182     }
3183     jmp(SCAN_SUBSTR);
3184 
3185     bind(RET_FOUND_LONG);
3186     movptr(str1, Address(rsp, wordSize));
3187   } // non constant
3188 
3189   bind(RET_FOUND);
3190   // Compute substr offset
3191   subptr(result, str1);
3192   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3193     shrl(result, 1); // index
3194   }
3195   bind(CLEANUP);
3196   pop(rsp); // restore SP
3197 
3198 } // string_indexof
3199 
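     // Intrinsic for an indexOf(char) scan over a UTF-16 char[] (str1/cnt1 hold
     // the array address and element count). Rough Java-level sketch of the
     // semantics (illustrative only, not the actual library code):
     //
     //   static int indexOfCharSketch(char[] value, char ch, int count) {
     //     for (int i = 0; i < count; i++) {
     //       if (value[i] == ch) return i;
     //     }
     //     return -1;
     //   }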
3200 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3201                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3202   ShortBranchVerifier sbv(this);
3203   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3204 
3205   int stride = 8;
3206 
3207   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3208         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3209         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3210         FOUND_SEQ_CHAR, DONE_LABEL;
3211 
3212   movptr(result, str1);
3213   if (UseAVX >= 2) {
3214     cmpl(cnt1, stride);
3215     jcc(Assembler::less, SCAN_TO_CHAR);
3216     cmpl(cnt1, 2*stride);
3217     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3218     movdl(vec1, ch);
3219     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3220     vpxor(vec2, vec2);
3221     movl(tmp, cnt1);
3222     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3223     andl(cnt1,0x0000000F);  //tail count (in chars)
3224 
3225     bind(SCAN_TO_16_CHAR_LOOP);
3226     vmovdqu(vec3, Address(result, 0));
3227     vpcmpeqw(vec3, vec3, vec1, 1);
3228     vptest(vec2, vec3);
3229     jcc(Assembler::carryClear, FOUND_CHAR);
3230     addptr(result, 32);
3231     subl(tmp, 2*stride);
3232     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3233     jmp(SCAN_TO_8_CHAR);
3234     bind(SCAN_TO_8_CHAR_INIT);
3235     movdl(vec1, ch);
3236     pshuflw(vec1, vec1, 0x00);
3237     pshufd(vec1, vec1, 0);
3238     pxor(vec2, vec2);
3239   }
3240   bind(SCAN_TO_8_CHAR);
3241   cmpl(cnt1, stride);
3242   jcc(Assembler::less, SCAN_TO_CHAR);
3243   if (UseAVX < 2) {
3244     movdl(vec1, ch);
3245     pshuflw(vec1, vec1, 0x00);
3246     pshufd(vec1, vec1, 0);
3247     pxor(vec2, vec2);
3248   }
3249   movl(tmp, cnt1);
3250   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3251   andl(cnt1,0x00000007);  //tail count (in chars)
3252 
3253   bind(SCAN_TO_8_CHAR_LOOP);
3254   movdqu(vec3, Address(result, 0));
3255   pcmpeqw(vec3, vec1);
3256   ptest(vec2, vec3);
3257   jcc(Assembler::carryClear, FOUND_CHAR);
3258   addptr(result, 16);
3259   subl(tmp, stride);
3260   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3261   bind(SCAN_TO_CHAR);
3262   testl(cnt1, cnt1);
3263   jcc(Assembler::zero, RET_NOT_FOUND);
3264   bind(SCAN_TO_CHAR_LOOP);
3265   load_unsigned_short(tmp, Address(result, 0));
3266   cmpl(ch, tmp);
3267   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3268   addptr(result, 2);
3269   subl(cnt1, 1);
3270   jccb(Assembler::zero, RET_NOT_FOUND);
3271   jmp(SCAN_TO_CHAR_LOOP);
3272 
3273   bind(RET_NOT_FOUND);
3274   movl(result, -1);
3275   jmpb(DONE_LABEL);
3276 
3277   bind(FOUND_CHAR);
3278   if (UseAVX >= 2) {
3279     vpmovmskb(tmp, vec3);
3280   } else {
3281     pmovmskb(tmp, vec3);
3282   }
3283   bsfl(ch, tmp);
3284   addptr(result, ch);
3285 
3286   bind(FOUND_SEQ_CHAR);
3287   subptr(result, str1);
3288   shrl(result, 1);
3289 
3290   bind(DONE_LABEL);
3291 } // string_indexof_char
3292 
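     // Latin-1 (byte[]) variant of the char scan above: same semantics, but the
     // elements are single bytes, so the returned offset needs no >> 1 scaling.
     // Rough sketch (illustrative only):
     //
     //   static int indexOfByteSketch(byte[] value, int ch, int count) {
     //     for (int i = 0; i < count; i++) {
     //       if ((value[i] & 0xff) == ch) return i;
     //     }
     //     return -1;
     //   }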
3293 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3294                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3295   ShortBranchVerifier sbv(this);
3296   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3297 
3298   int stride = 16;
3299 
3300   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3301         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3302         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3303         FOUND_SEQ_CHAR, DONE_LABEL;
3304 
3305   movptr(result, str1);
3306   if (UseAVX >= 2) {
3307     cmpl(cnt1, stride);
3308     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3309     cmpl(cnt1, stride*2);
3310     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3311     movdl(vec1, ch);
3312     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3313     vpxor(vec2, vec2);
3314     movl(tmp, cnt1);
3315     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3316     andl(cnt1,0x0000001F);  //tail count (in chars)
3317 
3318     bind(SCAN_TO_32_CHAR_LOOP);
3319     vmovdqu(vec3, Address(result, 0));
3320     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3321     vptest(vec2, vec3);
3322     jcc(Assembler::carryClear, FOUND_CHAR);
3323     addptr(result, 32);
3324     subl(tmp, stride*2);
3325     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3326     jmp(SCAN_TO_16_CHAR);
3327 
3328     bind(SCAN_TO_16_CHAR_INIT);
3329     movdl(vec1, ch);
3330     pxor(vec2, vec2);
3331     pshufb(vec1, vec2);
3332   }
3333 
3334   bind(SCAN_TO_16_CHAR);
3335   cmpl(cnt1, stride);
3336   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3337   if (UseAVX < 2) {
3338     movdl(vec1, ch);
3339     pxor(vec2, vec2);
3340     pshufb(vec1, vec2);
3341   }
3342   movl(tmp, cnt1);
3343   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3344   andl(cnt1,0x0000000F);  //tail count (in bytes)
3345 
3346   bind(SCAN_TO_16_CHAR_LOOP);
3347   movdqu(vec3, Address(result, 0));
3348   pcmpeqb(vec3, vec1);
3349   ptest(vec2, vec3);
3350   jcc(Assembler::carryClear, FOUND_CHAR);
3351   addptr(result, 16);
3352   subl(tmp, stride);
3353   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3354 
3355   bind(SCAN_TO_CHAR_INIT);
3356   testl(cnt1, cnt1);
3357   jcc(Assembler::zero, RET_NOT_FOUND);
3358   bind(SCAN_TO_CHAR_LOOP);
3359   load_unsigned_byte(tmp, Address(result, 0));
3360   cmpl(ch, tmp);
3361   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3362   addptr(result, 1);
3363   subl(cnt1, 1);
3364   jccb(Assembler::zero, RET_NOT_FOUND);
3365   jmp(SCAN_TO_CHAR_LOOP);
3366 
3367   bind(RET_NOT_FOUND);
3368   movl(result, -1);
3369   jmpb(DONE_LABEL);
3370 
3371   bind(FOUND_CHAR);
3372   if (UseAVX >= 2) {
3373     vpmovmskb(tmp, vec3);
3374   } else {
3375     pmovmskb(tmp, vec3);
3376   }
3377   bsfl(ch, tmp);
3378   addptr(result, ch);
3379 
3380   bind(FOUND_SEQ_CHAR);
3381   subptr(result, str1);
3382 
3383   bind(DONE_LABEL);
3384 } // stringL_indexof_char
3385 
3386 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3387   switch (eltype) {
3388   case T_BOOLEAN: return sizeof(jboolean);
3389   case T_BYTE:  return sizeof(jbyte);
3390   case T_SHORT: return sizeof(jshort);
3391   case T_CHAR:  return sizeof(jchar);
3392   case T_INT:   return sizeof(jint);
3393   default:
3394     ShouldNotReachHere();
3395     return -1;
3396   }
3397 }
3398 
3399 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3400   switch (eltype) {
3401   // T_BOOLEAN used as surrogate for unsigned byte
3402   case T_BOOLEAN: movzbl(dst, src);   break;
3403   case T_BYTE:    movsbl(dst, src);   break;
3404   case T_SHORT:   movswl(dst, src);   break;
3405   case T_CHAR:    movzwl(dst, src);   break;
3406   case T_INT:     movl(dst, src);     break;
3407   default:
3408     ShouldNotReachHere();
3409   }
3410 }
3411 
3412 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3413   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3414 }
3415 
3416 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3417   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3418 }
3419 
3420 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3421   const int vlen = Assembler::AVX_256bit;
3422   switch (eltype) {
3423   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3424   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3425   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3426   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3427   case T_INT:
3428     // do nothing
3429     break;
3430   default:
3431     ShouldNotReachHere();
3432   }
3433 }
3434 
3435 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3436                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3437                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3438                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3439                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3440                                         BasicType eltype) {
3441   ShortBranchVerifier sbv(this);
3442   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3443   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3444   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3445 
3446   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3447         SHORT_UNROLLED_LOOP_EXIT,
3448         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3449         UNROLLED_VECTOR_LOOP_BEGIN,
3450         END;
3451   switch (eltype) {
3452   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3453   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3454   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3455   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3456   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3457   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3458   }
3459 
3460   // Register aliases ("renaming") for readability of the code
3461   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3462                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3463                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3464 
3465   const int elsize = arrays_hashcode_elsize(eltype);
3466 
3467   /*
3468     if (cnt1 >= 2) {
3469       if (cnt1 >= 32) {
3470         UNROLLED VECTOR LOOP
3471       }
3472       UNROLLED SCALAR LOOP
3473     }
3474     SINGLE SCALAR
3475    */
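       // Scalar reference for what the unrolled/vectorized code below computes,
       // assuming the usual polynomial hash with elements widened to int per
       // 'eltype' (a sketch for readability, not the generated code):
       //
       //   static int hashCodeSketch(int result, int[] a) {
       //     for (int v : a) {
       //       result = 31 * result + v;
       //     }
       //     return result;
       //   }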
3476 
3477   cmpl(cnt1, 32);
3478   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3479 
3480   // cnt1 >= 32 && generate_vectorized_loop
3481   xorl(index, index);
3482 
3483   // vresult = IntVector.zero(I256);
3484   for (int idx = 0; idx < 4; idx++) {
3485     vpxor(vresult[idx], vresult[idx]);
3486   }
3487   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3488   Register bound = tmp2;
3489   Register next = tmp3;
3490   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3491   movl(next, Address(tmp2, 0));
3492   movdl(vnext, next);
3493   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3494 
3495   // index = 0;
3496   // bound = cnt1 & ~(32 - 1);
3497   movl(bound, cnt1);
3498   andl(bound, ~(32 - 1));
3499   // for (; index < bound; index += 32) {
3500   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3501   // result *= next;
3502   imull(result, next);
3503   // loop fission to front-load the cost of fetching from memory; OOO execution
3504   // can then hopefully do a better job of prefetching
3505   for (int idx = 0; idx < 4; idx++) {
3506     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3507   }
3508   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3509   for (int idx = 0; idx < 4; idx++) {
3510     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3511     arrays_hashcode_elvcast(vtmp[idx], eltype);
3512     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3513   }
3514   // index += 32;
3515   addl(index, 32);
3516   // index < bound;
3517   cmpl(index, bound);
3518   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3519   // }
3520 
3521   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3522   subl(cnt1, bound);
3523   // release bound
3524 
3525   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3526   for (int idx = 0; idx < 4; idx++) {
3527     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3528     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3529     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3530   }
3531   // result += vresult.reduceLanes(ADD);
3532   for (int idx = 0; idx < 4; idx++) {
3533     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3534   }
3535 
3536   // } else if (cnt1 < 32) {
3537 
3538   bind(SHORT_UNROLLED_BEGIN);
3539   // int i = 1;
3540   movl(index, 1);
3541   cmpl(index, cnt1);
3542   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3543 
3544   // for (; i < cnt1 ; i += 2) {
3545   bind(SHORT_UNROLLED_LOOP_BEGIN);
3546   movl(tmp3, 961);
3547   imull(result, tmp3);
3548   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3549   movl(tmp3, tmp2);
3550   shll(tmp3, 5);
3551   subl(tmp3, tmp2);
3552   addl(result, tmp3);
3553   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3554   addl(result, tmp3);
3555   addl(index, 2);
3556   cmpl(index, cnt1);
3557   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3558 
3559   // }
3560   // if (i >= cnt1) {
3561   bind(SHORT_UNROLLED_LOOP_EXIT);
3562   jccb(Assembler::greater, END);
3563   movl(tmp2, result);
3564   shll(result, 5);
3565   subl(result, tmp2);
3566   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3567   addl(result, tmp3);
3568   // }
3569   bind(END);
3570 
3571   BLOCK_COMMENT("} // arrays_hashcode");
3572 
3573 } // arrays_hashcode
3574 
3575 // helper function for string_compare
3576 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3577                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3578                                            Address::ScaleFactor scale2, Register index, int ae) {
3579   if (ae == StrIntrinsicNode::LL) {
3580     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3581     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3582   } else if (ae == StrIntrinsicNode::UU) {
3583     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3584     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3585   } else {
3586     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3587     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3588   }
3589 }
3590 
3591 // Compare strings, used for char[] and byte[].
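     // Rough Java-level sketch of the comparison semantics (illustrative only,
     // not the actual library code): return the difference of the first pair of
     // elements that differ, otherwise the difference of the lengths. Mixed
     // encodings widen Latin-1 bytes to chars before comparing.
     //
     //   static int compareSketch(int[] s1, int[] s2) {  // elements already widened
     //     int min = Math.min(s1.length, s2.length);
     //     for (int i = 0; i < min; i++) {
     //       if (s1[i] != s2[i]) return s1[i] - s2[i];
     //     }
     //     return s1.length - s2.length;
     //   }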
3592 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3593                                        Register cnt1, Register cnt2, Register result,
3594                                        XMMRegister vec1, int ae, KRegister mask) {
3595   ShortBranchVerifier sbv(this);
3596   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3597   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only AVX3
3598   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3599   int stride2x2 = 0x40;
3600   Address::ScaleFactor scale = Address::no_scale;
3601   Address::ScaleFactor scale1 = Address::no_scale;
3602   Address::ScaleFactor scale2 = Address::no_scale;
3603 
3604   if (ae != StrIntrinsicNode::LL) {
3605     stride2x2 = 0x20;
3606   }
3607 
3608   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3609     shrl(cnt2, 1);
3610   }
3611   // Compute the minimum of the string lengths, and save the
3612   // difference of the string lengths on the stack.
3613   // Select the minimum with a conditional move.
3614   movl(result, cnt1);
3615   subl(cnt1, cnt2);
3616   push(cnt1);
3617   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3618 
3619   // Is the minimum length zero?
3620   testl(cnt2, cnt2);
3621   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3622   if (ae == StrIntrinsicNode::LL) {
3623     // Load first bytes
3624     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3625     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3626   } else if (ae == StrIntrinsicNode::UU) {
3627     // Load first characters
3628     load_unsigned_short(result, Address(str1, 0));
3629     load_unsigned_short(cnt1, Address(str2, 0));
3630   } else {
3631     load_unsigned_byte(result, Address(str1, 0));
3632     load_unsigned_short(cnt1, Address(str2, 0));
3633   }
3634   subl(result, cnt1);
3635   jcc(Assembler::notZero,  POP_LABEL);
3636 
3637   if (ae == StrIntrinsicNode::UU) {
3638     // Divide length by 2 to get number of chars
3639     shrl(cnt2, 1);
3640   }
3641   cmpl(cnt2, 1);
3642   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3643 
3644   // Check if the strings start at the same location and set up scale and stride
3645   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3646     cmpptr(str1, str2);
3647     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3648     if (ae == StrIntrinsicNode::LL) {
3649       scale = Address::times_1;
3650       stride = 16;
3651     } else {
3652       scale = Address::times_2;
3653       stride = 8;
3654     }
3655   } else {
3656     scale1 = Address::times_1;
3657     scale2 = Address::times_2;
3658     // scale not used
3659     stride = 8;
3660   }
3661 
3662   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3663     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3664     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3665     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3666     Label COMPARE_TAIL_LONG;
3667     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only AVX3
3668 
3669     int pcmpmask = 0x19;
3670     if (ae == StrIntrinsicNode::LL) {
3671       pcmpmask &= ~0x01;
3672     }
3673 
3674     // Set up to compare 16-char (32-byte) vectors;
3675     // start from the first character again because it has an aligned address.
3676     if (ae == StrIntrinsicNode::LL) {
3677       stride2 = 32;
3678     } else {
3679       stride2 = 16;
3680     }
3681     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3682       adr_stride = stride << scale;
3683     } else {
3684       adr_stride1 = 8;  //stride << scale1;
3685       adr_stride2 = 16; //stride << scale2;
3686     }
3687 
3688     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3689     // rax and rdx are used by pcmpestri as element counters
3690     movl(result, cnt2);
3691     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3692     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3693 
3694     // Fast path: compare the first two 8-char vectors.
3695     bind(COMPARE_16_CHARS);
3696     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3697       movdqu(vec1, Address(str1, 0));
3698     } else {
3699       pmovzxbw(vec1, Address(str1, 0));
3700     }
3701     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3702     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3703 
3704     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3705       movdqu(vec1, Address(str1, adr_stride));
3706       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3707     } else {
3708       pmovzxbw(vec1, Address(str1, adr_stride1));
3709       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3710     }
3711     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3712     addl(cnt1, stride);
3713 
3714     // Compare the characters at index in cnt1
3715     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3716     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3717     subl(result, cnt2);
3718     jmp(POP_LABEL);
3719 
3720     // Setup the registers to start vector comparison loop
3721     bind(COMPARE_WIDE_VECTORS);
3722     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3723       lea(str1, Address(str1, result, scale));
3724       lea(str2, Address(str2, result, scale));
3725     } else {
3726       lea(str1, Address(str1, result, scale1));
3727       lea(str2, Address(str2, result, scale2));
3728     }
3729     subl(result, stride2);
3730     subl(cnt2, stride2);
3731     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3732     negptr(result);
3733 
3734     //  In a loop, compare 16 chars (32 bytes) at once using (vpxor+vptest)
3735     bind(COMPARE_WIDE_VECTORS_LOOP);
3736 
3737     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3738       cmpl(cnt2, stride2x2);
3739       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3740       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3741       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3742 
3743       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3744       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3745         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3746         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3747       } else {
3748         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3749         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11 if the operands are equal, otherwise k7 has some 0 bits
3750       }
3751       kortestql(mask, mask);
3752       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3753       addptr(result, stride2x2);  // update since we already compared at this addr
3754       subl(cnt2, stride2x2);      // and sub the size too
3755       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3756 
3757       vpxor(vec1, vec1);
3758       jmpb(COMPARE_WIDE_TAIL);
3759     }//if (VM_Version::supports_avx512vlbw())
3760 
3761     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3762     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3763       vmovdqu(vec1, Address(str1, result, scale));
3764       vpxor(vec1, Address(str2, result, scale));
3765     } else {
3766       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3767       vpxor(vec1, Address(str2, result, scale2));
3768     }
3769     vptest(vec1, vec1);
3770     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3771     addptr(result, stride2);
3772     subl(cnt2, stride2);
3773     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3774     // clean upper bits of YMM registers
3775     vpxor(vec1, vec1);
3776 
3777     // compare wide vectors tail
3778     bind(COMPARE_WIDE_TAIL);
3779     testptr(result, result);
3780     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3781 
3782     movl(result, stride2);
3783     movl(cnt2, result);
3784     negptr(result);
3785     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3786 
3787     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3788     bind(VECTOR_NOT_EQUAL);
3789     // clean upper bits of YMM registers
3790     vpxor(vec1, vec1);
3791     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3792       lea(str1, Address(str1, result, scale));
3793       lea(str2, Address(str2, result, scale));
3794     } else {
3795       lea(str1, Address(str1, result, scale1));
3796       lea(str2, Address(str2, result, scale2));
3797     }
3798     jmp(COMPARE_16_CHARS);
3799 
3800     // Compare tail chars, length between 1 and 15 chars
3801     bind(COMPARE_TAIL_LONG);
3802     movl(cnt2, result);
3803     cmpl(cnt2, stride);
3804     jcc(Assembler::less, COMPARE_SMALL_STR);
3805 
3806     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3807       movdqu(vec1, Address(str1, 0));
3808     } else {
3809       pmovzxbw(vec1, Address(str1, 0));
3810     }
3811     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3812     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3813     subptr(cnt2, stride);
3814     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3815     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3816       lea(str1, Address(str1, result, scale));
3817       lea(str2, Address(str2, result, scale));
3818     } else {
3819       lea(str1, Address(str1, result, scale1));
3820       lea(str2, Address(str2, result, scale2));
3821     }
3822     negptr(cnt2);
3823     jmpb(WHILE_HEAD_LABEL);
3824 
3825     bind(COMPARE_SMALL_STR);
3826   } else if (UseSSE42Intrinsics) {
3827     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3828     int pcmpmask = 0x19;
3829     // Set up to compare 8-char (16-byte) vectors;
3830     // start from the first character again because it has an aligned address.
3831     movl(result, cnt2);
3832     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3833     if (ae == StrIntrinsicNode::LL) {
3834       pcmpmask &= ~0x01;
3835     }
3836     jcc(Assembler::zero, COMPARE_TAIL);
3837     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3838       lea(str1, Address(str1, result, scale));
3839       lea(str2, Address(str2, result, scale));
3840     } else {
3841       lea(str1, Address(str1, result, scale1));
3842       lea(str2, Address(str2, result, scale2));
3843     }
3844     negptr(result);
3845 
3846     // pcmpestri
3847     //   inputs:
3848     //     vec1- substring
3849     //     rax - negative string length (elements count)
3850     //     mem - scanned string
3851     //     rdx - string length (elements count)
3852     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3853     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3854     //   outputs:
3855     //     rcx - first mismatched element index
3856     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3857 
3858     bind(COMPARE_WIDE_VECTORS);
3859     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3860       movdqu(vec1, Address(str1, result, scale));
3861       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3862     } else {
3863       pmovzxbw(vec1, Address(str1, result, scale1));
3864       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3865     }
3866     // After pcmpestri cnt1(rcx) contains mismatched element index
3867 
3868     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3869     addptr(result, stride);
3870     subptr(cnt2, stride);
3871     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3872 
3873     // compare wide vectors tail
3874     testptr(result, result);
3875     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3876 
3877     movl(cnt2, stride);
3878     movl(result, stride);
3879     negptr(result);
3880     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3881       movdqu(vec1, Address(str1, result, scale));
3882       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3883     } else {
3884       pmovzxbw(vec1, Address(str1, result, scale1));
3885       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3886     }
3887     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3888 
3889     // Mismatched characters in the vectors
3890     bind(VECTOR_NOT_EQUAL);
3891     addptr(cnt1, result);
3892     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3893     subl(result, cnt2);
3894     jmpb(POP_LABEL);
3895 
3896     bind(COMPARE_TAIL); // limit is zero
3897     movl(cnt2, result);
3898     // Fallthru to tail compare
3899   }
3900   // Shift str2 and str1 to the end of the arrays, negate min
3901   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3902     lea(str1, Address(str1, cnt2, scale));
3903     lea(str2, Address(str2, cnt2, scale));
3904   } else {
3905     lea(str1, Address(str1, cnt2, scale1));
3906     lea(str2, Address(str2, cnt2, scale2));
3907   }
3908   decrementl(cnt2);  // first character was compared already
3909   negptr(cnt2);
3910 
3911   // Compare the rest of the elements
3912   bind(WHILE_HEAD_LABEL);
3913   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3914   subl(result, cnt1);
3915   jccb(Assembler::notZero, POP_LABEL);
3916   increment(cnt2);
3917   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3918 
3919   // Strings are equal up to min length.  Return the length difference.
3920   bind(LENGTH_DIFF_LABEL);
3921   pop(result);
3922   if (ae == StrIntrinsicNode::UU) {
3923     // Divide diff by 2 to get number of chars
3924     sarl(result, 1);
3925   }
3926   jmpb(DONE_LABEL);
3927 
3928   if (VM_Version::supports_avx512vlbw()) {
3929 
3930     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3931 
3932     kmovql(cnt1, mask);
3933     notq(cnt1);
3934     bsfq(cnt2, cnt1);
3935     if (ae != StrIntrinsicNode::LL) {
3936       // Divide diff by 2 to get number of chars
3937       sarl(cnt2, 1);
3938     }
3939     addq(result, cnt2);
3940     if (ae == StrIntrinsicNode::LL) {
3941       load_unsigned_byte(cnt1, Address(str2, result));
3942       load_unsigned_byte(result, Address(str1, result));
3943     } else if (ae == StrIntrinsicNode::UU) {
3944       load_unsigned_short(cnt1, Address(str2, result, scale));
3945       load_unsigned_short(result, Address(str1, result, scale));
3946     } else {
3947       load_unsigned_short(cnt1, Address(str2, result, scale2));
3948       load_unsigned_byte(result, Address(str1, result, scale1));
3949     }
3950     subl(result, cnt1);
3951     jmpb(POP_LABEL);
3952   }//if (VM_Version::supports_avx512vlbw())
3953 
3954   // Discard the stored length difference
3955   bind(POP_LABEL);
3956   pop(cnt1);
3957 
3958   // That's it
3959   bind(DONE_LABEL);
3960   if(ae == StrIntrinsicNode::UL) {
3961     negl(result);
3962   }
3963 
3964 }
3965 
3966 // Search for a non-ASCII character (negative byte value) in a byte array;
3967 // return the index of the first such character, otherwise the length
3968 // of the array segment searched.
3969 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3970 //   @IntrinsicCandidate
3971 //   public static int countPositives(byte[] ba, int off, int len) {
3972 //     for (int i = off; i < off + len; i++) {
3973 //       if (ba[i] < 0) {
3974 //         return i - off;
3975 //       }
3976 //     }
3977 //     return len;
3978 //   }
3979 void C2_MacroAssembler::count_positives(Register ary1, Register len,
3980   Register result, Register tmp1,
3981   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3982   // rsi: byte array
3983   // rcx: len
3984   // rax: result
3985   ShortBranchVerifier sbv(this);
3986   assert_different_registers(ary1, len, result, tmp1);
3987   assert_different_registers(vec1, vec2);
3988   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3989 
3990   movl(result, len); // copy
3991   // len == 0
3992   testl(len, len);
3993   jcc(Assembler::zero, DONE);
3994 
3995   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3996     VM_Version::supports_avx512vlbw() &&
3997     VM_Version::supports_bmi2()) {
3998 
3999     Label test_64_loop, test_tail, BREAK_LOOP;
4000     movl(tmp1, len);
4001     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4002 
4003     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4004     andl(len,  0xffffffc0); // vector count (in chars)
4005     jccb(Assembler::zero, test_tail);
4006 
4007     lea(ary1, Address(ary1, len, Address::times_1));
4008     negptr(len);
4009 
4010     bind(test_64_loop);
4011     // Check whether these 64 byte-sized elements contain negatives
4012     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4013     kortestql(mask1, mask1);
4014     jcc(Assembler::notZero, BREAK_LOOP);
4015 
4016     addptr(len, 64);
4017     jccb(Assembler::notZero, test_64_loop);
4018 
4019     bind(test_tail);
4020     // bail out when there is nothing to be done
4021     testl(tmp1, -1);
4022     jcc(Assembler::zero, DONE);
4023 
4024 
4025     // check the tail for absence of negatives
4026     // ~(~0 << tail_count) gives a mask of the low tail_count bits
4027     {
4028       Register tmp3_aliased = len;
4029       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4030       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4031       notq(tmp3_aliased);
4032       kmovql(mask2, tmp3_aliased);
4033     }
4034 
4035     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4036     ktestq(mask1, mask2);
4037     jcc(Assembler::zero, DONE);
4038 
4039     // do a full check for negative bytes in the tail
4040     movl(len, tmp1); // tmp1 holds low 6-bit from original len;
4041                      // ary1 already pointing to the right place
4042     jmpb(TAIL_START);
4043 
4044     bind(BREAK_LOOP);
4045     // At least one byte in the last 64 byte block was negative.
4046     // Set up to look at the last 64 bytes as if they were a tail
4047     lea(ary1, Address(ary1, len, Address::times_1));
4048     addptr(result, len);
4049     // Ignore the very last byte: if all other bytes are positive,
4050     // the last one must be the negative byte, so we can skip right to
4051     // the 2+1 byte end comparison at this point
4052     orl(result, 63);
4053     movl(len, 63);
4054     // Fallthru to tail compare
4055   } else {
4056 
4057     if (UseAVX >= 2) {
4058       // With AVX2, use 32-byte vector compare
4059       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4060 
4061       // Compare 32-byte vectors
4062       testl(len, 0xffffffe0);   // vector count (in bytes)
4063       jccb(Assembler::zero, TAIL_START);
4064 
4065       andl(len, 0xffffffe0);
4066       lea(ary1, Address(ary1, len, Address::times_1));
4067       negptr(len);
4068 
4069       movl(tmp1, 0x80808080);   // create mask to test for negative (non-ASCII) bytes in vector
4070       movdl(vec2, tmp1);
4071       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4072 
4073       bind(COMPARE_WIDE_VECTORS);
4074       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4075       vptest(vec1, vec2);
4076       jccb(Assembler::notZero, BREAK_LOOP);
4077       addptr(len, 32);
4078       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4079 
4080       testl(result, 0x0000001f);   // any bytes remaining?
4081       jcc(Assembler::zero, DONE);
4082 
4083       // Quick test using the already prepared vector mask
4084       movl(len, result);
4085       andl(len, 0x0000001f);
4086       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4087       vptest(vec1, vec2);
4088       jcc(Assembler::zero, DONE);
4089       // There are zeros, jump to the tail to determine exactly where
4090       jmpb(TAIL_START);
4091 
4092       bind(BREAK_LOOP);
4093       // At least one byte in the last 32-byte vector is negative.
4094       // Set up to look at the last 32 bytes as if they were a tail
4095       lea(ary1, Address(ary1, len, Address::times_1));
4096       addptr(result, len);
4097       // Ignore the very last byte: if all other bytes are positive,
4098       // the last one must be the negative byte, so we can skip right to
4099       // the 2+1 byte end comparison at this point
4100       orl(result, 31);
4101       movl(len, 31);
4102       // Fallthru to tail compare
4103     } else if (UseSSE42Intrinsics) {
4104       // With SSE4.2, use double quad vector compare
4105       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4106 
4107       // Compare 16-byte vectors
4108       testl(len, 0xfffffff0);   // vector count (in bytes)
4109       jcc(Assembler::zero, TAIL_START);
4110 
4111       andl(len, 0xfffffff0);
4112       lea(ary1, Address(ary1, len, Address::times_1));
4113       negptr(len);
4114 
4115       movl(tmp1, 0x80808080);
4116       movdl(vec2, tmp1);
4117       pshufd(vec2, vec2, 0);
4118 
4119       bind(COMPARE_WIDE_VECTORS);
4120       movdqu(vec1, Address(ary1, len, Address::times_1));
4121       ptest(vec1, vec2);
4122       jccb(Assembler::notZero, BREAK_LOOP);
4123       addptr(len, 16);
4124       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4125 
4126       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4127       jcc(Assembler::zero, DONE);
4128 
4129       // Quick test using the already prepared vector mask
4130       movl(len, result);
4131       andl(len, 0x0000000f);   // tail count (in bytes)
4132       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4133       ptest(vec1, vec2);
4134       jcc(Assembler::zero, DONE);
4135       jmpb(TAIL_START);
4136 
4137       bind(BREAK_LOOP);
4138       // At least one byte in the last 16-byte vector is negative.
4139       // Set up and look at the last 16 bytes as if they were a tail
4140       lea(ary1, Address(ary1, len, Address::times_1));
4141       addptr(result, len);
4142       // Ignore the very last byte: if all other bytes are positive,
4143       // the last one must be the negative byte, so we can skip right to
4144       // the 2+1 byte end comparison at this point
4145       orl(result, 15);
4146       movl(len, 15);
4147       // Fallthru to tail compare
4148     }
4149   }
4150 
4151   bind(TAIL_START);
4152   // Compare 4-byte vectors
4153   andl(len, 0xfffffffc); // vector count (in bytes)
4154   jccb(Assembler::zero, COMPARE_CHAR);
4155 
4156   lea(ary1, Address(ary1, len, Address::times_1));
4157   negptr(len);
4158 
4159   bind(COMPARE_VECTORS);
4160   movl(tmp1, Address(ary1, len, Address::times_1));
4161   andl(tmp1, 0x80808080);
4162   jccb(Assembler::notZero, TAIL_ADJUST);
4163   addptr(len, 4);
4164   jccb(Assembler::notZero, COMPARE_VECTORS);
4165 
4166   // Compare trailing char (final 2-3 bytes), if any
4167   bind(COMPARE_CHAR);
4168 
4169   testl(result, 0x2);   // tail  char
4170   jccb(Assembler::zero, COMPARE_BYTE);
4171   load_unsigned_short(tmp1, Address(ary1, 0));
4172   andl(tmp1, 0x00008080);
4173   jccb(Assembler::notZero, CHAR_ADJUST);
4174   lea(ary1, Address(ary1, 2));
4175 
4176   bind(COMPARE_BYTE);
4177   testl(result, 0x1);   // tail  byte
4178   jccb(Assembler::zero, DONE);
4179   load_unsigned_byte(tmp1, Address(ary1, 0));
4180   testl(tmp1, 0x00000080);
4181   jccb(Assembler::zero, DONE);
4182   subptr(result, 1);
4183   jmpb(DONE);
4184 
4185   bind(TAIL_ADJUST);
4186   // There are negative bytes in the last 4-byte block.
4187   // Adjust result and check the next three bytes.
4188   addptr(result, len);
4189   orl(result, 3);
4190   lea(ary1, Address(ary1, len, Address::times_1));
4191   jmpb(COMPARE_CHAR);
4192 
4193   bind(CHAR_ADJUST);
4194   // We are looking at a char + optional byte tail, and found that one
4195   // of the bytes in the char is negative. Adjust the result, check the
4196   // first byte and readjust if needed.
4197   andl(result, 0xfffffffc);
4198   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4199   jccb(Assembler::notZero, DONE);
4200   addptr(result, 1);
4201 
4202   // That's it
4203   bind(DONE);
4204   if (UseAVX >= 2) {
4205     // clean upper bits of YMM registers
4206     vpxor(vec1, vec1);
4207     vpxor(vec2, vec2);
4208   }
4209 }
4210 
4211 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
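// A rough Java-level sketch of the is_array_equ/byte flavor generated below
// (illustrative only; the is_char and expand_ary2 variants differ in element width):
//   static boolean equals(byte[] a, byte[] b) {
//     if (a == b) return true;
//     if (a == null || b == null || a.length != b.length) return false;
//     for (int i = 0; i < a.length; i++) {
//       if (a[i] != b[i]) return false;
//     }
//     return true;
//   }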
4212 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4213                                       Register limit, Register result, Register chr,
4214                                       XMMRegister vec1, XMMRegister vec2, bool is_char,
4215                                       KRegister mask, bool expand_ary2) {
4216   // for expand_ary2, limit is the (smaller) size of the second array.
4217   ShortBranchVerifier sbv(this);
4218   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4219 
4220   assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4221          "Expansion only implemented for AVX2");
4222 
4223   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4224   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4225 
4226   Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4227   int scaleIncr = expand_ary2 ? 8 : 16;
4228 
4229   if (is_array_equ) {
4230     // Check the input args
4231     cmpoop(ary1, ary2);
4232     jcc(Assembler::equal, TRUE_LABEL);
4233 
4234     // Need additional checks for arrays_equals.
4235     testptr(ary1, ary1);
4236     jcc(Assembler::zero, FALSE_LABEL);
4237     testptr(ary2, ary2);
4238     jcc(Assembler::zero, FALSE_LABEL);
4239 
4240     // Check the lengths
4241     movl(limit, Address(ary1, length_offset));
4242     cmpl(limit, Address(ary2, length_offset));
4243     jcc(Assembler::notEqual, FALSE_LABEL);
4244   }
4245 
4246   // count == 0
4247   testl(limit, limit);
4248   jcc(Assembler::zero, TRUE_LABEL);
4249 
4250   if (is_array_equ) {
4251     // Load array address
4252     lea(ary1, Address(ary1, base_offset));
4253     lea(ary2, Address(ary2, base_offset));
4254   }
4255 
4256   if (is_array_equ && is_char) {
4257     // arrays_equals when used for char[].
4258     shll(limit, 1);      // byte count != 0
4259   }
4260   movl(result, limit); // copy
4261 
4262   if (UseAVX >= 2) {
4263     // With AVX2, use 32-byte vector compare
4264     Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4265 
4266     // Compare 32-byte vectors
4267     if (expand_ary2) {
4268       andl(result, 0x0000000f);  //   tail count (in bytes)
4269       andl(limit, 0xfffffff0);   // vector count (in bytes)
4270       jcc(Assembler::zero, COMPARE_TAIL);
4271     } else {
4272       andl(result, 0x0000001f);  //   tail count (in bytes)
4273       andl(limit, 0xffffffe0);   // vector count (in bytes)
4274       jcc(Assembler::zero, COMPARE_TAIL_16);
4275     }
4276 
4277     lea(ary1, Address(ary1, limit, scaleFactor));
4278     lea(ary2, Address(ary2, limit, Address::times_1));
4279     negptr(limit);
4280 
4281     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4282       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4283 
4284       cmpl(limit, -64);
4285       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4286 
4287       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4288 
4289       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4290       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4291       kortestql(mask, mask);
4292       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4293       addptr(limit, 64);  // update since we already compared at this addr
4294       cmpl(limit, -64);
4295       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4296 
4297       // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the non-wide path:
4299       //  cmpl(limit, 0);
4300       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4301       // But since we stopped at the points ary{1,2}+limit which are
4302       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4303       // (|limit| <= 32 and result < 32),
4304       // we may just compare the last 64 bytes.
4305       //
      addptr(result, -64);   // it is safe, because we just came from this area
4307       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4308       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4309       kortestql(mask, mask);
4310       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4311 
4312       jmp(TRUE_LABEL);
4313 
4314       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4315 
4316     }//if (VM_Version::supports_avx512vlbw())
4317 
4318     bind(COMPARE_WIDE_VECTORS);
4319     vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4320     if (expand_ary2) {
4321       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4322     } else {
4323       vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4324     }
4325     vpxor(vec1, vec2);
4326 
4327     vptest(vec1, vec1);
4328     jcc(Assembler::notZero, FALSE_LABEL);
4329     addptr(limit, scaleIncr * 2);
4330     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4331 
4332     testl(result, result);
4333     jcc(Assembler::zero, TRUE_LABEL);
4334 
4335     vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4336     if (expand_ary2) {
4337       vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4338     } else {
4339       vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4340     }
4341     vpxor(vec1, vec2);
4342 
4343     vptest(vec1, vec1);
4344     jcc(Assembler::notZero, FALSE_LABEL);
4345     jmp(TRUE_LABEL);
4346 
4347     bind(COMPARE_TAIL_16); // limit is zero
4348     movl(limit, result);
4349 
4350     // Compare 16-byte chunks
4351     andl(result, 0x0000000f);  //   tail count (in bytes)
4352     andl(limit, 0xfffffff0);   // vector count (in bytes)
4353     jcc(Assembler::zero, COMPARE_TAIL);
4354 
4355     lea(ary1, Address(ary1, limit, scaleFactor));
4356     lea(ary2, Address(ary2, limit, Address::times_1));
4357     negptr(limit);
4358 
4359     bind(COMPARE_WIDE_VECTORS_16);
4360     movdqu(vec1, Address(ary1, limit, scaleFactor));
4361     if (expand_ary2) {
4362       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4363     } else {
4364       movdqu(vec2, Address(ary2, limit, Address::times_1));
4365     }
4366     pxor(vec1, vec2);
4367 
4368     ptest(vec1, vec1);
4369     jcc(Assembler::notZero, FALSE_LABEL);
4370     addptr(limit, scaleIncr);
4371     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4372 
4373     bind(COMPARE_TAIL); // limit is zero
4374     movl(limit, result);
4375     // Fallthru to tail compare
4376   } else if (UseSSE42Intrinsics) {
4377     // With SSE4.2, use double quad vector compare
4378     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4379 
4380     // Compare 16-byte vectors
4381     andl(result, 0x0000000f);  //   tail count (in bytes)
4382     andl(limit, 0xfffffff0);   // vector count (in bytes)
4383     jcc(Assembler::zero, COMPARE_TAIL);
4384 
4385     lea(ary1, Address(ary1, limit, Address::times_1));
4386     lea(ary2, Address(ary2, limit, Address::times_1));
4387     negptr(limit);
4388 
4389     bind(COMPARE_WIDE_VECTORS);
4390     movdqu(vec1, Address(ary1, limit, Address::times_1));
4391     movdqu(vec2, Address(ary2, limit, Address::times_1));
4392     pxor(vec1, vec2);
4393 
4394     ptest(vec1, vec1);
4395     jcc(Assembler::notZero, FALSE_LABEL);
4396     addptr(limit, 16);
4397     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4398 
4399     testl(result, result);
4400     jcc(Assembler::zero, TRUE_LABEL);
4401 
4402     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4403     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4404     pxor(vec1, vec2);
4405 
4406     ptest(vec1, vec1);
4407     jccb(Assembler::notZero, FALSE_LABEL);
4408     jmpb(TRUE_LABEL);
4409 
4410     bind(COMPARE_TAIL); // limit is zero
4411     movl(limit, result);
4412     // Fallthru to tail compare
4413   }
4414 
4415   // Compare 4-byte vectors
4416   if (expand_ary2) {
4417     testl(result, result);
4418     jccb(Assembler::zero, TRUE_LABEL);
4419   } else {
4420     andl(limit, 0xfffffffc); // vector count (in bytes)
4421     jccb(Assembler::zero, COMPARE_CHAR);
4422   }
4423 
4424   lea(ary1, Address(ary1, limit, scaleFactor));
4425   lea(ary2, Address(ary2, limit, Address::times_1));
4426   negptr(limit);
4427 
4428   bind(COMPARE_VECTORS);
4429   if (expand_ary2) {
4430     // There are no "vector" operations for bytes to shorts
4431     movzbl(chr, Address(ary2, limit, Address::times_1));
4432     cmpw(Address(ary1, limit, Address::times_2), chr);
4433     jccb(Assembler::notEqual, FALSE_LABEL);
4434     addptr(limit, 1);
4435     jcc(Assembler::notZero, COMPARE_VECTORS);
4436     jmp(TRUE_LABEL);
4437   } else {
4438     movl(chr, Address(ary1, limit, Address::times_1));
4439     cmpl(chr, Address(ary2, limit, Address::times_1));
4440     jccb(Assembler::notEqual, FALSE_LABEL);
4441     addptr(limit, 4);
4442     jcc(Assembler::notZero, COMPARE_VECTORS);
4443   }
4444 
4445   // Compare trailing char (final 2 bytes), if any
4446   bind(COMPARE_CHAR);
4447   testl(result, 0x2);   // tail  char
4448   jccb(Assembler::zero, COMPARE_BYTE);
4449   load_unsigned_short(chr, Address(ary1, 0));
4450   load_unsigned_short(limit, Address(ary2, 0));
4451   cmpl(chr, limit);
4452   jccb(Assembler::notEqual, FALSE_LABEL);
4453 
4454   if (is_array_equ && is_char) {
4455     bind(COMPARE_BYTE);
4456   } else {
4457     lea(ary1, Address(ary1, 2));
4458     lea(ary2, Address(ary2, 2));
4459 
4460     bind(COMPARE_BYTE);
4461     testl(result, 0x1);   // tail  byte
4462     jccb(Assembler::zero, TRUE_LABEL);
4463     load_unsigned_byte(chr, Address(ary1, 0));
4464     load_unsigned_byte(limit, Address(ary2, 0));
4465     cmpl(chr, limit);
4466     jccb(Assembler::notEqual, FALSE_LABEL);
4467   }
4468   bind(TRUE_LABEL);
4469   movl(result, 1);   // return true
4470   jmpb(DONE);
4471 
4472   bind(FALSE_LABEL);
4473   xorl(result, result); // return false
4474 
4475   // That's it
4476   bind(DONE);
4477   if (UseAVX >= 2) {
4478     // clean upper bits of YMM registers
4479     vpxor(vec1, vec1);
4480     vpxor(vec2, vec2);
4481   }
4482 }
4483 
4484 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4485 #define __ masm.
4486   Register dst = stub.data<0>();
4487   XMMRegister src = stub.data<1>();
4488   address target = stub.data<2>();
4489   __ bind(stub.entry());
4490   __ subptr(rsp, 8);
4491   __ movdbl(Address(rsp), src);
4492   __ call(RuntimeAddress(target));
4493   // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
4494   __ pop(dst);
4495   __ jmp(stub.continuation());
4496 #undef __
4497 }
4498 
4499 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4500   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4501   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4502 
4503   address slowpath_target;
4504   if (dst_bt == T_INT) {
4505     if (src_bt == T_FLOAT) {
4506       cvttss2sil(dst, src);
4507       cmpl(dst, 0x80000000);
4508       slowpath_target = StubRoutines::x86::f2i_fixup();
4509     } else {
4510       cvttsd2sil(dst, src);
4511       cmpl(dst, 0x80000000);
4512       slowpath_target = StubRoutines::x86::d2i_fixup();
4513     }
4514   } else {
4515     if (src_bt == T_FLOAT) {
4516       cvttss2siq(dst, src);
4517       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4518       slowpath_target = StubRoutines::x86::f2l_fixup();
4519     } else {
4520       cvttsd2siq(dst, src);
4521       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4522       slowpath_target = StubRoutines::x86::d2l_fixup();
4523     }
4524   }
4525 
4526   // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
4527   int max_size = 23 + (UseAPX ? 1 : 0);
4528   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
4529   jcc(Assembler::equal, stub->entry());
4530   bind(stub->continuation());
4531 }
4532 
4533 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4534                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4535   switch(ideal_opc) {
4536     case Op_LShiftVS:
4537       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4538     case Op_LShiftVI:
4539       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4540     case Op_LShiftVL:
4541       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4542     case Op_RShiftVS:
4543       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4544     case Op_RShiftVI:
4545       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4546     case Op_RShiftVL:
4547       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4548     case Op_URShiftVS:
4549       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4550     case Op_URShiftVI:
4551       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4552     case Op_URShiftVL:
4553       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4554     case Op_RotateRightV:
4555       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4556     case Op_RotateLeftV:
4557       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4558     default:
4559       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4560       break;
4561   }
4562 }
4563 
4564 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4565                                                XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4566   if (is_unsigned) {
4567     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4568   } else {
4569     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4570   }
4571 }
4572 
4573 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4574                                                       XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4575   switch (elem_bt) {
4576     case T_BYTE:
4577       if (ideal_opc == Op_SaturatingAddV) {
4578         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4579       } else {
4580         assert(ideal_opc == Op_SaturatingSubV, "");
4581         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4582       }
4583       break;
4584     case T_SHORT:
4585       if (ideal_opc == Op_SaturatingAddV) {
4586         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4587       } else {
4588         assert(ideal_opc == Op_SaturatingSubV, "");
4589         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4590       }
4591       break;
4592     default:
4593       fatal("Unsupported type %s", type2name(elem_bt));
4594       break;
4595   }
4596 }
4597 
4598 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4599                                                         XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4600   switch (elem_bt) {
4601     case T_BYTE:
4602       if (ideal_opc == Op_SaturatingAddV) {
4603         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4604       } else {
4605         assert(ideal_opc == Op_SaturatingSubV, "");
4606         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4607       }
4608       break;
4609     case T_SHORT:
4610       if (ideal_opc == Op_SaturatingAddV) {
4611         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4612       } else {
4613         assert(ideal_opc == Op_SaturatingSubV, "");
4614         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4615       }
4616       break;
4617     default:
4618       fatal("Unsupported type %s", type2name(elem_bt));
4619       break;
4620   }
4621 }
4622 
4623 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4624                                                Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4625   if (is_unsigned) {
4626     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4627   } else {
4628     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4629   }
4630 }
4631 
4632 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4633                                                       XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4634   switch (elem_bt) {
4635     case T_BYTE:
4636       if (ideal_opc == Op_SaturatingAddV) {
4637         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4638       } else {
4639         assert(ideal_opc == Op_SaturatingSubV, "");
4640         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4641       }
4642       break;
4643     case T_SHORT:
4644       if (ideal_opc == Op_SaturatingAddV) {
4645         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4646       } else {
4647         assert(ideal_opc == Op_SaturatingSubV, "");
4648         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4649       }
4650       break;
4651     default:
4652       fatal("Unsupported type %s", type2name(elem_bt));
4653       break;
4654   }
4655 }
4656 
4657 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4658                                                         XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4659   switch (elem_bt) {
4660     case T_BYTE:
4661       if (ideal_opc == Op_SaturatingAddV) {
4662         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4663       } else {
4664         assert(ideal_opc == Op_SaturatingSubV, "");
4665         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4666       }
4667       break;
4668     case T_SHORT:
4669       if (ideal_opc == Op_SaturatingAddV) {
4670         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4671       } else {
4672         assert(ideal_opc == Op_SaturatingSubV, "");
4673         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4674       }
4675       break;
4676     default:
4677       fatal("Unsupported type %s", type2name(elem_bt));
4678       break;
4679   }
4680 }
4681 
4682 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4683                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4684                                     bool is_varshift) {
4685   switch (ideal_opc) {
4686     case Op_AddVB:
4687       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4688     case Op_AddVS:
4689       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4690     case Op_AddVI:
4691       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4692     case Op_AddVL:
4693       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4694     case Op_AddVF:
4695       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4696     case Op_AddVD:
4697       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4698     case Op_SubVB:
4699       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4700     case Op_SubVS:
4701       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4702     case Op_SubVI:
4703       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4704     case Op_SubVL:
4705       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4706     case Op_SubVF:
4707       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4708     case Op_SubVD:
4709       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4710     case Op_MulVS:
4711       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4712     case Op_MulVI:
4713       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4714     case Op_MulVL:
4715       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4716     case Op_MulVF:
4717       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4718     case Op_MulVD:
4719       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4720     case Op_DivVF:
4721       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4722     case Op_DivVD:
4723       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4724     case Op_SqrtVF:
4725       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4726     case Op_SqrtVD:
4727       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4728     case Op_AbsVB:
4729       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4730     case Op_AbsVS:
4731       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4732     case Op_AbsVI:
4733       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4734     case Op_AbsVL:
4735       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4736     case Op_FmaVF:
4737       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4738     case Op_FmaVD:
4739       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4740     case Op_VectorRearrange:
4741       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4742     case Op_LShiftVS:
4743       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4744     case Op_LShiftVI:
4745       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4746     case Op_LShiftVL:
4747       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4748     case Op_RShiftVS:
4749       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4750     case Op_RShiftVI:
4751       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4752     case Op_RShiftVL:
4753       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4754     case Op_URShiftVS:
4755       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4756     case Op_URShiftVI:
4757       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4758     case Op_URShiftVL:
4759       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4760     case Op_RotateLeftV:
4761       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4762     case Op_RotateRightV:
4763       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4764     case Op_MaxV:
4765       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4766     case Op_MinV:
4767       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4768     case Op_UMinV:
4769       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4770     case Op_UMaxV:
4771       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4772     case Op_XorV:
4773       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4774     case Op_OrV:
4775       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4776     case Op_AndV:
4777       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4778     default:
4779       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4780       break;
4781   }
4782 }
4783 
4784 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4785                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4786   switch (ideal_opc) {
4787     case Op_AddVB:
4788       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4789     case Op_AddVS:
4790       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4791     case Op_AddVI:
4792       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4793     case Op_AddVL:
4794       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4795     case Op_AddVF:
4796       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4797     case Op_AddVD:
4798       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4799     case Op_SubVB:
4800       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4801     case Op_SubVS:
4802       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4803     case Op_SubVI:
4804       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4805     case Op_SubVL:
4806       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4807     case Op_SubVF:
4808       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4809     case Op_SubVD:
4810       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4811     case Op_MulVS:
4812       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4813     case Op_MulVI:
4814       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4815     case Op_MulVL:
4816       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4817     case Op_MulVF:
4818       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4819     case Op_MulVD:
4820       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4821     case Op_DivVF:
4822       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4823     case Op_DivVD:
4824       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4825     case Op_FmaVF:
4826       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4827     case Op_FmaVD:
4828       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4829     case Op_MaxV:
4830       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4831     case Op_MinV:
4832       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4833     case Op_UMaxV:
4834       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4835     case Op_UMinV:
4836       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4837     case Op_XorV:
4838       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4839     case Op_OrV:
4840       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4841     case Op_AndV:
4842       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4843     default:
4844       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4845       break;
4846   }
4847 }
4848 
4849 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4850                                   KRegister src1, KRegister src2) {
4851   BasicType etype = T_ILLEGAL;
4852   switch(mask_len) {
4853     case 2:
4854     case 4:
4855     case 8:  etype = T_BYTE; break;
4856     case 16: etype = T_SHORT; break;
4857     case 32: etype = T_INT; break;
4858     case 64: etype = T_LONG; break;
4859     default: fatal("Unsupported type"); break;
4860   }
4861   assert(etype != T_ILLEGAL, "");
4862   switch(ideal_opc) {
4863     case Op_AndVMask:
4864       kand(etype, dst, src1, src2); break;
4865     case Op_OrVMask:
4866       kor(etype, dst, src1, src2); break;
4867     case Op_XorVMask:
4868       kxor(etype, dst, src1, src2); break;
4869     default:
4870       fatal("Unsupported masked operation"); break;
4871   }
4872 }
4873 
4874 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
4876  * If src is NaN, the result is 0.
4877  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4878  * the result is equal to the value of Integer.MIN_VALUE.
4879  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4880  * the result is equal to the value of Integer.MAX_VALUE.
4881  */
4882 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4883                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4884                                                                    Register rscratch, AddressLiteral float_sign_flip,
4885                                                                    int vec_enc) {
4886   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4887   Label done;
4888   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4889   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4890   vptest(xtmp2, xtmp2, vec_enc);
4891   jccb(Assembler::equal, done);
4892 
4893   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4894   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4895 
4896   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4897   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4898   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4899 
  // Recompute the mask for the remaining special values.
4901   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4902   // Extract SRC values corresponding to TRUE mask lanes.
4903   vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip the mask bits so that the MSB of mask lanes corresponding to positive
  // special values is set.
4906   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4907 
4908   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4909   bind(done);
4910 }
4911 
4912 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4913                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4914                                                                     Register rscratch, AddressLiteral float_sign_flip,
4915                                                                     int vec_enc) {
4916   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4917   Label done;
4918   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4919   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4920   kortestwl(ktmp1, ktmp1);
4921   jccb(Assembler::equal, done);
4922 
4923   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4924   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4925   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4926 
4927   kxorwl(ktmp1, ktmp1, ktmp2);
4928   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4929   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4930   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4931   bind(done);
4932 }
4933 
4934 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4935                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4936                                                                      Register rscratch, AddressLiteral double_sign_flip,
4937                                                                      int vec_enc) {
4938   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4939 
4940   Label done;
4941   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4942   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4943   kortestwl(ktmp1, ktmp1);
4944   jccb(Assembler::equal, done);
4945 
4946   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4947   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4948   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4949 
4950   kxorwl(ktmp1, ktmp1, ktmp2);
4951   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4952   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4953   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4954   bind(done);
4955 }
4956 
4957 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4958                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4959                                                                      Register rscratch, AddressLiteral float_sign_flip,
4960                                                                      int vec_enc) {
4961   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4962   Label done;
4963   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4964   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4965   kortestwl(ktmp1, ktmp1);
4966   jccb(Assembler::equal, done);
4967 
4968   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4969   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4970   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4971 
4972   kxorwl(ktmp1, ktmp1, ktmp2);
4973   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4974   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4975   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4976   bind(done);
4977 }
4978 
4979 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
4981  * If src is NaN, the result is 0.
4982  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4983  * the result is equal to the value of Long.MIN_VALUE.
4984  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4985  * the result is equal to the value of Long.MAX_VALUE.
4986  */
4987 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4988                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4989                                                                       Register rscratch, AddressLiteral double_sign_flip,
4990                                                                       int vec_enc) {
4991   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4992 
4993   Label done;
4994   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4995   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4996   kortestwl(ktmp1, ktmp1);
4997   jccb(Assembler::equal, done);
4998 
4999   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5000   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5001   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5002 
5003   kxorwl(ktmp1, ktmp1, ktmp2);
5004   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5005   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5006   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5007   bind(done);
5008 }
5009 
5010 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5011                                                              XMMRegister xtmp, int index, int vec_enc) {
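  // For example (illustrative): with index 0x88 (0b10'00'10'00) vshufps selects
  // doublewords {0, 2} from each source, i.e. the low 32 bits of every 64-bit
  // lane; the D2I callers use this to pack per-quadword masks into doubleword lanes.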
  assert(vec_enc < Assembler::AVX_512bit, "");
  if (vec_enc == Assembler::AVX_256bit) {
    vextractf128_high(xtmp, src);
    vshufps(dst, src, xtmp, index, vec_enc);
  } else {
    vshufps(dst, src, zero, index, vec_enc);
  }
5019 }
5020 
5021 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5022                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5023                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
5024   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5025 
5026   Label done;
5027   // Compare the destination lanes with float_sign_flip
5028   // value to get mask for all special values.
5029   movdqu(xtmp1, float_sign_flip, rscratch);
5030   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5031   ptest(xtmp2, xtmp2);
5032   jccb(Assembler::equal, done);
5033 
5034   // Flip float_sign_flip to get max integer value.
5035   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5036   pxor(xtmp1, xtmp4);
5037 
  // Set destination lanes corresponding to unordered (NaN) source lanes to zero.
5039   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5040   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5041 
  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5043   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5044   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5045 
  // Recompute the mask for the remaining special values.
5047   pxor(xtmp2, xtmp3);
5048   // Extract mask corresponding to non-negative source lanes.
5049   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5050 
  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5052   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5053   pand(xtmp3, xtmp2);
5054 
  // Replace destination lanes holding the special value (0x80000000) with max int
  // if the corresponding source lane holds a positive value.
5057   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5058   bind(done);
5059 }
5060 
5062 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5063                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5064   switch(to_elem_bt) {
5065     case T_SHORT:
5066       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5067       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5068       vpackusdw(dst, dst, zero, vec_enc);
5069       if (vec_enc == Assembler::AVX_256bit) {
5070         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5071       }
5072       break;
5073     case  T_BYTE:
5074       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5075       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5076       vpackusdw(dst, dst, zero, vec_enc);
5077       if (vec_enc == Assembler::AVX_256bit) {
5078         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5079       }
5080       vpackuswb(dst, dst, zero, vec_enc);
5081       break;
5082     default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt));
5083   }
5084 }
5085 
5086 /*
 * Algorithm for vector D2L and F2I conversions (when AVX10.2 is not supported):
 * a) Perform the vector D2L/F2I cast.
 * b) Take the fast path if no result vector lane contains the value 0x80000000;
 *    that value signifies that the corresponding source value could have been one
 *    of the special floating point values (NaN, -Inf, Inf, Max, -Min).
 * c) Set the destination lane to zero if the corresponding source lane is NaN.
 * d) Replace 0x80000000 with MaxInt if the corresponding source lane holds a positive value.
5094  */
5095 
5096 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5097                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5098                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5099   int to_elem_sz = type2aelembytes(to_elem_bt);
5100   assert(to_elem_sz <= 4, "");
5101   vcvttps2dq(dst, src, vec_enc);
5102   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5103   if (to_elem_sz < 4) {
5104     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5105     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5106   }
5107 }
5108 
5109 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5110                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5111                                             Register rscratch, int vec_enc) {
5112   int to_elem_sz = type2aelembytes(to_elem_bt);
5113   assert(to_elem_sz <= 4, "");
5114   vcvttps2dq(dst, src, vec_enc);
5115   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5116   switch(to_elem_bt) {
5117     case T_INT:
5118       break;
5119     case T_SHORT:
5120       evpmovdw(dst, dst, vec_enc);
5121       break;
5122     case T_BYTE:
5123       evpmovdb(dst, dst, vec_enc);
5124       break;
5125     default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt));
5126   }
5127 }
5128 
5129 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5130                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5131                                             Register rscratch, int vec_enc) {
5132   evcvttps2qq(dst, src, vec_enc);
5133   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5134 }
5135 
5136 // Handling for downcasting from double to integer or sub-word types on AVX2.
5137 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5138                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5139                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5140   int to_elem_sz = type2aelembytes(to_elem_bt);
5141   assert(to_elem_sz < 8, "");
5142   vcvttpd2dq(dst, src, vec_enc);
5143   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5144                                               float_sign_flip, vec_enc);
5145   if (to_elem_sz < 4) {
5146     // xtmp4 holds all zero lanes.
5147     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5148   }
5149 }
5150 
5151 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5152                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5153                                             KRegister ktmp2, AddressLiteral sign_flip,
5154                                             Register rscratch, int vec_enc) {
5155   if (VM_Version::supports_avx512dq()) {
5156     evcvttpd2qq(dst, src, vec_enc);
5157     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5158     switch(to_elem_bt) {
5159       case T_LONG:
5160         break;
5161       case T_INT:
5162         evpmovsqd(dst, dst, vec_enc);
5163         break;
5164       case T_SHORT:
5165         evpmovsqd(dst, dst, vec_enc);
5166         evpmovdw(dst, dst, vec_enc);
5167         break;
5168       case T_BYTE:
5169         evpmovsqd(dst, dst, vec_enc);
5170         evpmovdb(dst, dst, vec_enc);
5171         break;
5172       default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt));
5173     }
5174   } else {
5175     assert(type2aelembytes(to_elem_bt) <= 4, "");
5176     vcvttpd2dq(dst, src, vec_enc);
5177     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5178     switch(to_elem_bt) {
5179       case T_INT:
5180         break;
5181       case T_SHORT:
5182         evpmovdw(dst, dst, vec_enc);
5183         break;
5184       case T_BYTE:
5185         evpmovdb(dst, dst, vec_enc);
5186         break;
5187       default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt));
5188     }
5189   }
5190 }
5191 
5192 void C2_MacroAssembler::vector_castF2X_avx10(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5193   switch(to_elem_bt) {
5194     case T_LONG:
5195       evcvttps2qqs(dst, src, vec_enc);
5196       break;
5197     case T_INT:
5198       evcvttps2dqs(dst, src, vec_enc);
5199       break;
5200     case T_SHORT:
5201       evcvttps2dqs(dst, src, vec_enc);
5202       evpmovdw(dst, dst, vec_enc);
5203       break;
5204     case T_BYTE:
5205       evcvttps2dqs(dst, src, vec_enc);
5206       evpmovdb(dst, dst, vec_enc);
5207       break;
5208     default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt));
5209   }
5210 }
5211 
5212 void C2_MacroAssembler::vector_castF2X_avx10(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5213   switch(to_elem_bt) {
5214     case T_LONG:
5215       evcvttps2qqs(dst, src, vec_enc);
5216       break;
5217     case T_INT:
5218       evcvttps2dqs(dst, src, vec_enc);
5219       break;
5220     case T_SHORT:
5221       evcvttps2dqs(dst, src, vec_enc);
5222       evpmovdw(dst, dst, vec_enc);
5223       break;
5224     case T_BYTE:
5225       evcvttps2dqs(dst, src, vec_enc);
5226       evpmovdb(dst, dst, vec_enc);
5227       break;
5228     default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt));
5229   }
5230 }
5231 
5232 void C2_MacroAssembler::vector_castD2X_avx10(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5233   switch(to_elem_bt) {
5234     case T_LONG:
5235       evcvttpd2qqs(dst, src, vec_enc);
5236       break;
5237     case T_INT:
5238       evcvttpd2dqs(dst, src, vec_enc);
5239       break;
5240     case T_SHORT:
5241       evcvttpd2dqs(dst, src, vec_enc);
5242       evpmovdw(dst, dst, vec_enc);
5243       break;
5244     case T_BYTE:
5245       evcvttpd2dqs(dst, src, vec_enc);
5246       evpmovdb(dst, dst, vec_enc);
5247       break;
5248     default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (reg src): %s", type2name(to_elem_bt));
5249   }
5250 }
5251 
5252 void C2_MacroAssembler::vector_castD2X_avx10(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5253   switch(to_elem_bt) {
5254     case T_LONG:
5255       evcvttpd2qqs(dst, src, vec_enc);
5256       break;
5257     case T_INT:
5258       evcvttpd2dqs(dst, src, vec_enc);
5259       break;
5260     case T_SHORT:
5261       evcvttpd2dqs(dst, src, vec_enc);
5262       evpmovdw(dst, dst, vec_enc);
5263       break;
5264     case T_BYTE:
5265       evcvttpd2dqs(dst, src, vec_enc);
5266       evpmovdb(dst, dst, vec_enc);
5267       break;
5268     default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt));
5269   }
5270 }
5271 
5272 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5273                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5274                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val + 0.5) operation with the MXCSR.RC mode set to round towards -inf,
  // and restore the original MXCSR.RC mode afterwards.
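  // For reference (illustrative), the intended result matches Java's Math.round,
  // e.g. Math.round(2.5) == 3, Math.round(-2.5) == -2, Math.round(Double.NaN) == 0,
  // and +/-Infinity saturate to Long.MAX_VALUE / Long.MIN_VALUE.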
5277   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5278 
5279   mov64(tmp, julong_cast(0.5L));
5280   evpbroadcastq(xtmp1, tmp, vec_enc);
5281   vaddpd(xtmp1, src , xtmp1, vec_enc);
5282   evcvtpd2qq(dst, xtmp1, vec_enc);
5283   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                                double_sign_flip, vec_enc);
5285 
5286   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5287 }
5288 
5289 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5290                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5291                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val + 0.5) operation with the MXCSR.RC mode set to round towards -inf,
  // and restore the original MXCSR.RC mode afterwards.
5294   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5295 
5296   movl(tmp, jint_cast(0.5));
5297   movq(xtmp1, tmp);
5298   vbroadcastss(xtmp1, xtmp1, vec_enc);
5299   vaddps(xtmp1, src , xtmp1, vec_enc);
5300   vcvtps2dq(dst, xtmp1, vec_enc);
5301   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5302                                               float_sign_flip, vec_enc);
5303 
5304   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5305 }
5306 
5307 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5308                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5309                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform the floor(val + 0.5) operation with the MXCSR.RC mode set to round towards -inf,
  // and restore the original MXCSR.RC mode afterwards.
5312   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5313 
5314   movl(tmp, jint_cast(0.5));
5315   movq(xtmp1, tmp);
5316   vbroadcastss(xtmp1, xtmp1, vec_enc);
5317   vaddps(xtmp1, src , xtmp1, vec_enc);
5318   vcvtps2dq(dst, xtmp1, vec_enc);
5319   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5320 
5321   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5322 }
5323 
5324 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5325                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5326   switch (from_elem_bt) {
5327     case T_BYTE:
5328       switch (to_elem_bt) {
5329         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5330         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5331         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5332         default: ShouldNotReachHere();
5333       }
5334       break;
5335     case T_SHORT:
5336       switch (to_elem_bt) {
5337         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5338         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5339         default: ShouldNotReachHere();
5340       }
5341       break;
5342     case T_INT:
5343       assert(to_elem_bt == T_LONG, "");
5344       vpmovzxdq(dst, src, vlen_enc);
5345       break;
5346     default:
5347       ShouldNotReachHere();
5348   }
5349 }
5350 
5351 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5352                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5353   switch (from_elem_bt) {
5354     case T_BYTE:
5355       switch (to_elem_bt) {
5356         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5357         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5358         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5359         default: ShouldNotReachHere();
5360       }
5361       break;
5362     case T_SHORT:
5363       switch (to_elem_bt) {
5364         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5365         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5366         default: ShouldNotReachHere();
5367       }
5368       break;
5369     case T_INT:
5370       assert(to_elem_bt == T_LONG, "");
5371       vpmovsxdq(dst, src, vlen_enc);
5372       break;
5373     default:
5374       ShouldNotReachHere();
5375   }
5376 }
5377 
5378 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5379                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5380   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5381   assert(vlen_enc != AVX_512bit, "");
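  // Note (illustrative): mask lanes are expected to be all-zero or all-ones, so
  // widening can use sign-extending moves and narrowing can use signed saturating
  // packs; the vpermq(..., 0x08, ...) steps below gather quadwords {0, 2} (the
  // valid halves of each 128-bit lane) into the low 128 bits of the result.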
5382 
5383   int dst_bt_size = type2aelembytes(dst_bt);
5384   int src_bt_size = type2aelembytes(src_bt);
5385   if (dst_bt_size > src_bt_size) {
5386     switch (dst_bt_size / src_bt_size) {
5387       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5388       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5389       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5390       default: ShouldNotReachHere();
5391     }
5392   } else {
5393     assert(dst_bt_size < src_bt_size, "");
5394     switch (src_bt_size / dst_bt_size) {
5395       case 2: {
5396         if (vlen_enc == AVX_128bit) {
5397           vpacksswb(dst, src, src, vlen_enc);
5398         } else {
5399           vpacksswb(dst, src, src, vlen_enc);
5400           vpermq(dst, dst, 0x08, vlen_enc);
5401         }
5402         break;
5403       }
5404       case 4: {
5405         if (vlen_enc == AVX_128bit) {
5406           vpackssdw(dst, src, src, vlen_enc);
5407           vpacksswb(dst, dst, dst, vlen_enc);
5408         } else {
5409           vpackssdw(dst, src, src, vlen_enc);
5410           vpermq(dst, dst, 0x08, vlen_enc);
5411           vpacksswb(dst, dst, dst, AVX_128bit);
5412         }
5413         break;
5414       }
5415       case 8: {
5416         if (vlen_enc == AVX_128bit) {
5417           vpshufd(dst, src, 0x08, vlen_enc);
5418           vpackssdw(dst, dst, dst, vlen_enc);
5419           vpacksswb(dst, dst, dst, vlen_enc);
5420         } else {
5421           vpshufd(dst, src, 0x08, vlen_enc);
5422           vpermq(dst, dst, 0x08, vlen_enc);
5423           vpackssdw(dst, dst, dst, AVX_128bit);
5424           vpacksswb(dst, dst, dst, AVX_128bit);
5425         }
5426         break;
5427       }
5428       default: ShouldNotReachHere();
5429     }
5430   }
5431 }
5432 
5433 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5434                                    bool merge, BasicType bt, int vlen_enc) {
5435   if (bt == T_INT) {
5436     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5437   } else {
5438     assert(bt == T_LONG, "");
5439     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5440   }
5441 }
5442 
5443 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5444                                    bool merge, BasicType bt, int vlen_enc) {
5445   if (bt == T_INT) {
5446     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5447   } else {
5448     assert(bt == T_LONG, "");
5449     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5450   }
5451 }
5452 
5453 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5454                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5455                                                int vec_enc) {
5456   int index = 0;
5457   int vindex = 0;
5458   mov64(rtmp1, 0x0101010101010101L);
5459   pdepq(rtmp1, src, rtmp1);
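  // pdep example (illustrative): with mask 0x0101010101010101, the low 8 bits of
  // src are scattered to bit 0 of each byte, e.g. src == 0b1011 yields
  // 0x0000000001000101, i.e. one 0/1 byte per mask lane.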
5460   if (mask_len > 8) {
5461     movq(rtmp2, src);
5462     vpxor(xtmp, xtmp, xtmp, vec_enc);
5463     movq(xtmp, rtmp1);
5464   }
5465   movq(dst, rtmp1);
5466 
5467   mask_len -= 8;
5468   while (mask_len > 0) {
    assert((mask_len & 0x7) == 0, "mask must be a multiple of 8");
5470     index++;
5471     if ((index % 2) == 0) {
5472       pxor(xtmp, xtmp);
5473     }
5474     mov64(rtmp1, 0x0101010101010101L);
5475     shrq(rtmp2, 8);
5476     pdepq(rtmp1, rtmp2, rtmp1);
5477     pinsrq(xtmp, rtmp1, index % 2);
5478     vindex = index / 2;
5479     if (vindex) {
      // Write the entire 16-byte vector only when both 64-bit
      // lanes have been updated, to save redundant instructions.
5482       if (index % 2) {
5483         vinsertf128(dst, dst, xtmp, vindex);
5484       }
5485     } else {
5486       vmovdqu(dst, xtmp);
5487     }
5488     mask_len -= 8;
5489   }
5490 }
5491 
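// Illustrative examples for the scalar mask reductions below (not generated code):
//   VectorMaskLastTrue : tmp == 0b101000 -> dst = 63 - lzcnt(tmp) = 5 (highest set bit);
//                        tmp == 0        -> dst = 63 - 64 = -1.
//   VectorMaskFirstTrue: for masklen < 32 a sentinel bit is OR'ed in at position
//                        masklen, so an all-zero mask yields tzcnt == masklen.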
5492 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5493   switch(opc) {
5494     case Op_VectorMaskTrueCount:
5495       popcntq(dst, tmp);
5496       break;
5497     case Op_VectorMaskLastTrue:
5498       if (VM_Version::supports_lzcnt()) {
5499         lzcntq(tmp, tmp);
5500         movl(dst, 63);
5501         subl(dst, tmp);
5502       } else {
5503         movl(dst, -1);
5504         bsrq(tmp, tmp);
5505         cmov32(Assembler::notZero, dst, tmp);
5506       }
5507       break;
5508     case Op_VectorMaskFirstTrue:
5509       if (VM_Version::supports_bmi1()) {
5510         if (masklen < 32) {
5511           orl(tmp, 1 << masklen);
5512           tzcntl(dst, tmp);
5513         } else if (masklen == 32) {
5514           tzcntl(dst, tmp);
5515         } else {
5516           assert(masklen == 64, "");
5517           tzcntq(dst, tmp);
5518         }
5519       } else {
5520         if (masklen < 32) {
5521           orl(tmp, 1 << masklen);
5522           bsfl(dst, tmp);
5523         } else {
5524           assert(masklen == 32 || masklen == 64, "");
5525           movl(dst, masklen);
5526           if (masklen == 32)  {
5527             bsfl(tmp, tmp);
5528           } else {
5529             bsfq(tmp, tmp);
5530           }
5531           cmov32(Assembler::notZero, dst, tmp);
5532         }
5533       }
5534       break;
5535     case Op_VectorMaskToLong:
5536       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5537       break;
5538     default: assert(false, "Unhandled mask operation");
5539   }
5540 }
5541 
5542 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5543                                               int masklen, int masksize, int vec_enc) {
5544   assert(VM_Version::supports_popcnt(), "");
5545 
5546   if(VM_Version::supports_avx512bw()) {
5547     kmovql(tmp, mask);
5548   } else {
5549     assert(masklen <= 16, "");
5550     kmovwl(tmp, mask);
5551   }
5552 
  // Masks generated by partial vector comparison/replicate/mask manipulation
  // operations need to be clipped.
5555   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5556     andq(tmp, (1 << masklen) - 1);
5557   }
5558 
5559   vector_mask_operation_helper(opc, dst, tmp, masklen);
5560 }
5561 
5562 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5563                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5564   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5565          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5566   assert(VM_Version::supports_popcnt(), "");
5567 
5568   bool need_clip = false;
5569   switch(bt) {
5570     case T_BOOLEAN:
      // While masks of other types contain 0 or -1 lanes, boolean masks contain lane values of 0 or 1.
5572       vpxor(xtmp, xtmp, xtmp, vec_enc);
5573       vpsubb(xtmp, xtmp, mask, vec_enc);
5574       vpmovmskb(tmp, xtmp, vec_enc);
5575       need_clip = masklen < 16;
5576       break;
5577     case T_BYTE:
5578       vpmovmskb(tmp, mask, vec_enc);
5579       need_clip = masklen < 16;
5580       break;
5581     case T_SHORT:
5582       vpacksswb(xtmp, mask, mask, vec_enc);
5583       if (masklen >= 16) {
5584         vpermpd(xtmp, xtmp, 8, vec_enc);
5585       }
5586       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5587       need_clip = masklen < 16;
5588       break;
5589     case T_INT:
5590     case T_FLOAT:
5591       vmovmskps(tmp, mask, vec_enc);
5592       need_clip = masklen < 4;
5593       break;
5594     case T_LONG:
5595     case T_DOUBLE:
5596       vmovmskpd(tmp, mask, vec_enc);
5597       need_clip = masklen < 2;
5598       break;
5599     default: assert(false, "Unhandled type, %s", type2name(bt));
5600   }
5601 
  // Masks generated by partial vector comparison/replicate/mask manipulation
  // operations need to be clipped.
5604   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5605     // need_clip implies masklen < 32
5606     andq(tmp, (1 << masklen) - 1);
5607   }
5608 
5609   vector_mask_operation_helper(opc, dst, tmp, masklen);
5610 }
5611 
5612 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5613                                              Register rtmp2, int mask_len) {
5614   kmov(rtmp1, src);
5615   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5616   mov64(rtmp2, -1L);
5617   pextq(rtmp2, rtmp2, rtmp1);
5618   kmov(dst, rtmp2);
5619 }
5620 
5621 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5622                                                     XMMRegister mask, Register rtmp, Register rscratch,
5623                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5624                                                     int vec_enc) {
5625   assert(type2aelembytes(bt) >= 4, "");
5626   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5627   address compress_perm_table = nullptr;
5628   address expand_perm_table = nullptr;
5629   if (type2aelembytes(bt) == 8) {
5630     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5631     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5632     vmovmskpd(rtmp, mask, vec_enc);
5633   } else {
5634     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5635     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5636     vmovmskps(rtmp, mask, vec_enc);
5637   }
5638   shlq(rtmp, 5); // for 32 byte permute row.
5639   if (opcode == Op_CompressV) {
5640     lea(rscratch, ExternalAddress(compress_perm_table));
5641   } else {
5642     lea(rscratch, ExternalAddress(expand_perm_table));
5643   }
5644   addptr(rtmp, rscratch);
5645   vmovdqu(permv, Address(rtmp));
5646   vpermps(dst, permv, src, Assembler::AVX_256bit);
5647   vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with a zero vector using the permute mask: each column entry
  // in a permute table row contains either a valid permute index or -1 (the default).
  // Since -1 has its sign bit set, the same row doubles as a blending mask after
  // compressing/expanding the source vector lanes.
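  // For example (illustrative; the exact row layout is defined by the stub tables):
  // with eight int lanes (AVX_256bit) and mask 0b00001101, the compress row would
  // hold the indices {0, 2, 3} followed by -1 entries, so the blend below zeroes
  // the trailing five lanes of the permuted result.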
5652   vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
5653 }
5654 
5655 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5656                                                bool merge, BasicType bt, int vec_enc) {
5657   if (opcode == Op_CompressV) {
5658     switch(bt) {
5659     case T_BYTE:
5660       evpcompressb(dst, mask, src, merge, vec_enc);
5661       break;
5662     case T_CHAR:
5663     case T_SHORT:
5664       evpcompressw(dst, mask, src, merge, vec_enc);
5665       break;
5666     case T_INT:
5667       evpcompressd(dst, mask, src, merge, vec_enc);
5668       break;
5669     case T_FLOAT:
5670       evcompressps(dst, mask, src, merge, vec_enc);
5671       break;
5672     case T_LONG:
5673       evpcompressq(dst, mask, src, merge, vec_enc);
5674       break;
5675     case T_DOUBLE:
5676       evcompresspd(dst, mask, src, merge, vec_enc);
5677       break;
5678     default:
5679       fatal("Unsupported type %s", type2name(bt));
5680       break;
5681     }
5682   } else {
5683     assert(opcode == Op_ExpandV, "");
5684     switch(bt) {
5685     case T_BYTE:
5686       evpexpandb(dst, mask, src, merge, vec_enc);
5687       break;
5688     case T_CHAR:
5689     case T_SHORT:
5690       evpexpandw(dst, mask, src, merge, vec_enc);
5691       break;
5692     case T_INT:
5693       evpexpandd(dst, mask, src, merge, vec_enc);
5694       break;
5695     case T_FLOAT:
5696       evexpandps(dst, mask, src, merge, vec_enc);
5697       break;
5698     case T_LONG:
5699       evpexpandq(dst, mask, src, merge, vec_enc);
5700       break;
5701     case T_DOUBLE:
5702       evexpandpd(dst, mask, src, merge, vec_enc);
5703       break;
5704     default:
5705       fatal("Unsupported type %s", type2name(bt));
5706       break;
5707     }
5708   }
5709 }
5710 
5711 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5712                                            KRegister ktmp1, int vec_enc) {
5713   if (opcode == Op_SignumVD) {
5714     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1.0 : 1.0
5716     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5717     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // If src is NaN, -0.0 or 0.0, return src.
5719     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5720     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5721   } else {
5722     assert(opcode == Op_SignumVF, "");
5723     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1.0 : 1.0
5725     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5726     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // If src is NaN, -0.0 or 0.0, return src.
5728     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5729     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5730   }
5731 }
5732 
5733 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5734                                           XMMRegister xtmp1, int vec_enc) {
5735   if (opcode == Op_SignumVD) {
5736     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1.0 : 1.0
5738     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // If src is NaN, -0.0 or 0.0, return src.
5740     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5741     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5742   } else {
5743     assert(opcode == Op_SignumVF, "");
5744     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1.0 : 1.0
5746     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // If src is NaN, -0.0 or 0.0, return src.
5748     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5749     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5750   }
5751 }
5752 
5753 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5754   if (VM_Version::supports_avx512bw()) {
5755     if (mask_len > 32) {
5756       kmovql(dst, src);
5757     } else {
5758       kmovdl(dst, src);
5759       if (mask_len != 32) {
5760         kshiftrdl(dst, dst, 32 - mask_len);
5761       }
5762     }
5763   } else {
5764     assert(mask_len <= 16, "");
5765     kmovwl(dst, src);
5766     if (mask_len != 16) {
5767       kshiftrwl(dst, dst, 16 - mask_len);
5768     }
5769   }
5770 }
5771 
5772 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5773   int lane_size = type2aelembytes(bt);
5774   if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5775       (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5776     movptr(rtmp, imm32);
5777     switch(lane_size) {
5778       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5779       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5780       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5781       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5782       fatal("Unsupported lane size %d", lane_size);
5783       break;
5784     }
5785   } else {
5786     movptr(rtmp, imm32);
5787     movq(dst, rtmp);
5788     switch(lane_size) {
5789       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5790       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5791       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5792       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5793       fatal("Unsupported lane size %d", lane_size);
5794       break;
5795     }
5796   }
5797 }
5798 
5799 //
// Following is the lookup table based popcount computation algorithm:-
5801 //       Index   Bit set count
5802 //     [ 0000 ->   0,
5803 //       0001 ->   1,
5804 //       0010 ->   1,
5805 //       0011 ->   2,
5806 //       0100 ->   1,
5807 //       0101 ->   2,
5808 //       0110 ->   2,
5809 //       0111 ->   3,
5810 //       1000 ->   1,
5811 //       1001 ->   2,
5812 //       1010 ->   3,
5813 //       1011 ->   3,
5814 //       1100 ->   2,
5815 //       1101 ->   3,
5816 //       1111 ->   4 ]
5817 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5818 //     shuffle indices for lookup table access.
5819 //  b. Right shift each byte of vector lane by 4 positions.
//  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
5821 //     shuffle indices for lookup table access.
5822 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5823 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5824 //     count of all the bytes of a quadword.
5825 //  f. Perform step e. for upper 128bit vector lane.
5826 //  g. Pack the bitset count of quadwords back to double word.
5827 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
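//
// A minimal scalar sketch of steps a-d for a single byte (illustrative only, not part
// of the emitted code; popc4 mirrors the bit set counts tabulated above):
//
//   static const uint8_t popc4[16] = {0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4};
//   static uint8_t popcount_byte(uint8_t b) {
//     return (uint8_t)(popc4[b & 0x0F] + popc4[b >> 4]);
//   }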
5828 
5829 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5830                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5831   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5832   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5833   vpsrlw(dst, src, 4, vec_enc);
5834   vpand(dst, dst, xtmp1, vec_enc);
5835   vpand(xtmp1, src, xtmp1, vec_enc);
5836   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5837   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5838   vpshufb(dst, xtmp2, dst, vec_enc);
5839   vpaddb(dst, dst, xtmp1, vec_enc);
5840 }
5841 
5842 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5843                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5844   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5845   // Following code is as per steps e,f,g and h of above algorithm.
5846   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5847   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5848   vpsadbw(dst, dst, xtmp2, vec_enc);
5849   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5850   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5851   vpackuswb(dst, xtmp1, dst, vec_enc);
5852 }
5853 
5854 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5855                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5856   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5857   // Add the popcount of upper and lower bytes of word.
5858   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5859   vpsrlw(dst, xtmp1, 8, vec_enc);
5860   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5861   vpaddw(dst, dst, xtmp1, vec_enc);
5862 }
5863 
5864 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5865                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5866   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5867   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5868   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5869 }
5870 
5871 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5872                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5873   switch(bt) {
5874     case T_LONG:
5875       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5876       break;
5877     case T_INT:
5878       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5879       break;
5880     case T_CHAR:
5881     case T_SHORT:
5882       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5883       break;
5884     case T_BYTE:
5885     case T_BOOLEAN:
5886       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5887       break;
5888     default:
5889       fatal("Unsupported type %s", type2name(bt));
5890       break;
5891   }
5892 }
5893 
5894 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5895                                                       KRegister mask, bool merge, int vec_enc) {
5896   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5897   switch(bt) {
5898     case T_LONG:
5899       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5900       evpopcntq(dst, mask, src, merge, vec_enc);
5901       break;
5902     case T_INT:
5903       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5904       evpopcntd(dst, mask, src, merge, vec_enc);
5905       break;
5906     case T_CHAR:
5907     case T_SHORT:
5908       assert(VM_Version::supports_avx512_bitalg(), "");
5909       evpopcntw(dst, mask, src, merge, vec_enc);
5910       break;
5911     case T_BYTE:
5912     case T_BOOLEAN:
5913       assert(VM_Version::supports_avx512_bitalg(), "");
5914       evpopcntb(dst, mask, src, merge, vec_enc);
5915       break;
5916     default:
5917       fatal("Unsupported type %s", type2name(bt));
5918       break;
5919   }
5920 }
5921 
5922 // Bit reversal algorithm first reverses the bits of each byte followed by
5923 // a byte level reversal for multi-byte primitive types (short/int/long).
5924 // Algorithm performs a lookup table access to get reverse bit sequence
5925 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte
5926 // is obtained by swapping the reverse bit sequences of upper and lower
5927 // nibble of a byte.
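//
// A minimal scalar sketch of the per-byte step (illustrative only; rev4 maps a 4 bit
// value to its bit reverse, as described above):
//
//   static const uint8_t rev4[16] = {0x0,0x8,0x4,0xC, 0x2,0xA,0x6,0xE,
//                                    0x1,0x9,0x5,0xD, 0x3,0xB,0x7,0xF};
//   static uint8_t reverse_byte(uint8_t b) {
//     return (uint8_t)((rev4[b & 0x0F] << 4) | rev4[b >> 4]);
//   }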
5928 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5929                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5930   if (VM_Version::supports_avx512vlbw()) {
5931 
5932     // Get the reverse bit sequence of lower nibble of each byte.
5933     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5934     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5935     evpandq(dst, xtmp2, src, vec_enc);
5936     vpshufb(dst, xtmp1, dst, vec_enc);
5937     vpsllq(dst, dst, 4, vec_enc);
5938 
5939     // Get the reverse bit sequence of upper nibble of each byte.
5940     vpandn(xtmp2, xtmp2, src, vec_enc);
5941     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5942     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5943 
5944     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5945     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5946     evporq(xtmp2, dst, xtmp2, vec_enc);
5947     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5948 
5949   } else if(vec_enc == Assembler::AVX_512bit) {
5950     // Shift based bit reversal.
5951     assert(bt == T_LONG || bt == T_INT, "");
5952 
5953     // Swap lower and upper nibble of each byte.
5954     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5955 
5956     // Swap two least and most significant bits of each nibble.
5957     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5958 
5959     // Swap adjacent pair of bits.
5960     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5961     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5962 
5963     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5964     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5965   } else {
5966     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5967     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5968 
5969     // Get the reverse bit sequence of lower nibble of each byte.
5970     vpand(dst, xtmp2, src, vec_enc);
5971     vpshufb(dst, xtmp1, dst, vec_enc);
5972     vpsllq(dst, dst, 4, vec_enc);
5973 
5974     // Get the reverse bit sequence of upper nibble of each byte.
5975     vpandn(xtmp2, xtmp2, src, vec_enc);
5976     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5977     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5978 
5979     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5980     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5981     vpor(xtmp2, dst, xtmp2, vec_enc);
5982     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5983   }
5984 }
5985 
5986 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5987                                                 XMMRegister xtmp, Register rscratch) {
5988   assert(VM_Version::supports_gfni(), "");
5989   assert(rscratch != noreg || always_reachable(mask), "missing");
5990 
5991   // Galois field instruction based bit reversal based on following algorithm.
5992   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5993   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5994   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5995   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5996 }
5997 
5998 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5999                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
6000   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
6001   evpandq(dst, xtmp1, src, vec_enc);
6002   vpsllq(dst, dst, nbits, vec_enc);
6003   vpandn(xtmp1, xtmp1, src, vec_enc);
6004   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
6005   evporq(dst, dst, xtmp1, vec_enc);
6006 }
6007 
6008 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6009                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
6010   // Shift based bit reversal.
6011   assert(VM_Version::supports_evex(), "");
6012   switch(bt) {
6013     case T_LONG:
6014       // Swap upper and lower double word of each quad word.
6015       evprorq(xtmp1, k0, src, 32, true, vec_enc);
6016       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
6017       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6018       break;
6019     case T_INT:
6020       // Swap upper and lower word of each double word.
6021       evprord(xtmp1, k0, src, 16, true, vec_enc);
6022       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6023       break;
6024     case T_CHAR:
6025     case T_SHORT:
6026       // Swap upper and lower byte of each word.
6027       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
6028       break;
6029     case T_BYTE:
6030       evmovdquq(dst, k0, src, true, vec_enc);
6031       break;
6032     default:
6033       fatal("Unsupported type %s", type2name(bt));
6034       break;
6035   }
6036 }
6037 
6038 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
6039   if (bt == T_BYTE) {
6040     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
6041       evmovdquq(dst, k0, src, true, vec_enc);
6042     } else {
6043       vmovdqu(dst, src);
6044     }
6045     return;
6046   }
6047   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
6048   // pre-computed shuffle indices.
6049   switch(bt) {
6050     case T_LONG:
6051       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
6052       break;
6053     case T_INT:
6054       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
6055       break;
6056     case T_CHAR:
6057     case T_SHORT:
6058       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6059       break;
6060     default:
6061       fatal("Unsupported type %s", type2name(bt));
6062       break;
6063   }
6064   vpshufb(dst, src, dst, vec_enc);
6065 }
6066 
6067 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6068                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6069                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6070   assert(is_integral_type(bt), "");
6071   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6072   assert(VM_Version::supports_avx512cd(), "");
6073   switch(bt) {
6074     case T_LONG:
6075       evplzcntq(dst, ktmp, src, merge, vec_enc);
6076       break;
6077     case T_INT:
6078       evplzcntd(dst, ktmp, src, merge, vec_enc);
6079       break;
6080     case T_SHORT:
6081       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6082       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6083       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6084       vpunpckhwd(dst, xtmp1, src, vec_enc);
6085       evplzcntd(dst, ktmp, dst, merge, vec_enc);
6086       vpackusdw(dst, xtmp2, dst, vec_enc);
6087       break;
6088     case T_BYTE:
6089       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6090       // accessing the lookup table.
6091       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6092       // accessing the lookup table.
6093       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6094       assert(VM_Version::supports_avx512bw(), "");
6095       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6096       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6097       vpand(xtmp2, dst, src, vec_enc);
6098       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6099       vpsrlw(xtmp3, src, 4, vec_enc);
6100       vpand(xtmp3, dst, xtmp3, vec_enc);
6101       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6102       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6103       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6104       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6105       break;
6106     default:
6107       fatal("Unsupported type %s", type2name(bt));
6108       break;
6109   }
6110 }
6111 
6112 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6113                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6114   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6115   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6116   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6117   // accessing the lookup table.
6118   vpand(dst, xtmp2, src, vec_enc);
6119   vpshufb(dst, xtmp1, dst, vec_enc);
6120   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6121   // accessing the lookup table.
6122   vpsrlw(xtmp3, src, 4, vec_enc);
6123   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6124   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6125   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6126   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6127   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6128   vpaddb(dst, dst, xtmp2, vec_enc);
6129   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6130 }
6131 
6132 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6133                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6134   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6135   // Add zero counts of lower byte and upper byte of a word if
6136   // upper byte holds a zero value.
6137   vpsrlw(xtmp3, src, 8, vec_enc);
6138   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6139   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6140   vpsllw(xtmp2, dst, 8, vec_enc);
6141   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6142   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6143   vpsrlw(dst, dst, 8, vec_enc);
6144 }
6145 
6146 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6147                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in 1.x form,
  // the biased exponent can be used to compute the leading zero count as per the
  // following formula:
  //   LZCNT = 31 - (biased_exp - 127)
  // Special handling is needed for zero, MAX_INT and negative source values.
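  //
  // A scalar sketch of the same idea for a strictly positive input (illustrative only;
  // float_bits is a hypothetical helper returning the raw bit pattern of a float, and
  // zero, MAX_INT and negative lanes get the special handling applied below):
  //
  //   int32_t y  = x & ~(x >> 1);                          // clear the bit below the highest set bit
  //   int32_t be = (float_bits((float) y) >> 23) & 0xFF;   // biased exponent = floor(log2(x)) + 127
  //   int     lz = 31 - (be - 127);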
6153 
6154   // Broadcast 0xFF
6155   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6156   vpsrld(xtmp1, xtmp1, 24, vec_enc);
6157 
6158   // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
6159   // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
6160   // contributes to the leading number of zeros.
6161   vpsrld(xtmp2, src, 1, vec_enc);
6162   vpandn(xtmp3, xtmp2, src, vec_enc);
6163 
6164   // Extract biased exponent.
6165   vcvtdq2ps(dst, xtmp3, vec_enc);
6166   vpsrld(dst, dst, 23, vec_enc);
6167   vpand(dst, dst, xtmp1, vec_enc);
6168 
6169   // Broadcast 127.
6170   vpsrld(xtmp1, xtmp1, 1, vec_enc);
6171   // Exponent = biased_exp - 127
6172   vpsubd(dst, dst, xtmp1, vec_enc);
6173 
6174   // Exponent_plus_one = Exponent + 1
6175   vpsrld(xtmp3, xtmp1, 6, vec_enc);
6176   vpaddd(dst, dst, xtmp3, vec_enc);
6177 
6178   // Replace -ve exponent with zero, exponent is -ve when src
6179   // lane contains a zero value.
6180   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6181   vblendvps(dst, dst, xtmp2, dst, vec_enc);
6182 
6183   // Rematerialize broadcast 32.
6184   vpslld(xtmp1, xtmp3, 5, vec_enc);
6185   // Exponent is 32 if corresponding source lane contains max_int value.
6186   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6187   // LZCNT = 32 - exponent_plus_one
6188   vpsubd(dst, xtmp1, dst, vec_enc);
6189 
6190   // Replace LZCNT with a value 1 if corresponding source lane
6191   // contains max_int value.
6192   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6193 
6194   // Replace biased_exp with 0 if source lane value is less than zero.
6195   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6196   vblendvps(dst, dst, xtmp2, src, vec_enc);
6197 }
6198 
6199 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6200                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6201   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6202   // Add zero counts of lower word and upper word of a double word if
6203   // upper word holds a zero value.
6204   vpsrld(xtmp3, src, 16, vec_enc);
6205   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6206   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6207   vpslld(xtmp2, dst, 16, vec_enc);
6208   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6209   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6210   vpsrld(dst, dst, 16, vec_enc);
6211   // Add zero counts of lower doubleword and upper doubleword of a
6212   // quadword if upper doubleword holds a zero value.
6213   vpsrlq(xtmp3, src, 32, vec_enc);
6214   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6215   vpsllq(xtmp2, dst, 32, vec_enc);
6216   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6217   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6218   vpsrlq(dst, dst, 32, vec_enc);
6219 }
6220 
6221 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6222                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6223                                                        Register rtmp, int vec_enc) {
6224   assert(is_integral_type(bt), "unexpected type");
6225   assert(vec_enc < Assembler::AVX_512bit, "");
6226   switch(bt) {
6227     case T_LONG:
6228       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6229       break;
6230     case T_INT:
6231       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6232       break;
6233     case T_SHORT:
6234       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6235       break;
6236     case T_BYTE:
6237       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6238       break;
6239     default:
6240       fatal("Unsupported type %s", type2name(bt));
6241       break;
6242   }
6243 }
6244 
6245 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6246   switch(bt) {
6247     case T_BYTE:
6248       vpsubb(dst, src1, src2, vec_enc);
6249       break;
6250     case T_SHORT:
6251       vpsubw(dst, src1, src2, vec_enc);
6252       break;
6253     case T_INT:
6254       vpsubd(dst, src1, src2, vec_enc);
6255       break;
6256     case T_LONG:
6257       vpsubq(dst, src1, src2, vec_enc);
6258       break;
6259     default:
6260       fatal("Unsupported type %s", type2name(bt));
6261       break;
6262   }
6263 }
6264 
// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
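//
// A scalar sketch of the identity for 32 bit lanes (illustrative only; lzcnt32 is a
// hypothetical helper that returns 32 for a zero input, matching vplzcntd):
//
//   uint32_t m   = (x - 1) & ~x;    // ones below the lowest set bit of x (all ones when x == 0)
//   int      ctz = 32 - lzcnt32(m);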
6269 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6270                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6271                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6272   assert(is_integral_type(bt), "");
6273   // xtmp = -1
6274   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6275   // xtmp = xtmp + src
6276   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6277   // xtmp = xtmp & ~src
6278   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6279   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6280   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6281   vpsub(bt, dst, xtmp4, dst, vec_enc);
6282 }
6283 
// Trailing zero count computation for AVX2 targets is based on the popcount operation as per
// the following equation.
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
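//
// A scalar sketch for 32 bit lanes (illustrative only; popcount32 is a hypothetical
// scalar popcount helper):
//
//   uint32_t m   = x | (0 - x);        // ones from the lowest set bit upward (0 when x == 0)
//   int      ctz = 32 - popcount32(m);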
6286 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6287                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6288   assert(is_integral_type(bt), "");
6289   // xtmp = 0
6290   vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
6291   // xtmp = 0 - src
6292   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6293   // xtmp = xtmp | src
6294   vpor(xtmp3, xtmp3, src, vec_enc);
6295   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6296   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6297   vpsub(bt, dst, xtmp1, dst, vec_enc);
6298 }
6299 
6300 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6301   Label done;
6302   Label neg_divisor_fastpath;
6303   cmpl(divisor, 0);
6304   jccb(Assembler::less, neg_divisor_fastpath);
6305   xorl(rdx, rdx);
6306   divl(divisor);
6307   jmpb(done);
6308   bind(neg_divisor_fastpath);
6309   // Fastpath for divisor < 0:
6310   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6311   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
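  // Why this works (illustrative): a divisor with its sign bit set is >= 2^31 when
  // treated as unsigned, so the unsigned quotient is either 0 or 1, and it is 1
  // exactly when dividend >=u divisor. E.g. dividend = 0xF0000000, divisor = 0x90000000:
  // dividend - divisor = 0x60000000, ~(...) = 0x9FFFFFFF, AND with the dividend gives
  // 0x90000000, and >>> 31 yields quotient 1.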
6312   movl(rdx, rax);
6313   subl(rdx, divisor);
6314   if (VM_Version::supports_bmi1()) {
6315     andnl(rax, rdx, rax);
6316   } else {
6317     notl(rdx);
6318     andl(rax, rdx);
6319   }
6320   shrl(rax, 31);
6321   bind(done);
6322 }
6323 
6324 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6325   Label done;
6326   Label neg_divisor_fastpath;
6327   cmpl(divisor, 0);
6328   jccb(Assembler::less, neg_divisor_fastpath);
6329   xorl(rdx, rdx);
6330   divl(divisor);
6331   jmpb(done);
6332   bind(neg_divisor_fastpath);
6333   // Fastpath when divisor < 0:
6334   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6335   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6336   movl(rdx, rax);
6337   subl(rax, divisor);
6338   if (VM_Version::supports_bmi1()) {
6339     andnl(rax, rax, rdx);
6340   } else {
6341     notl(rax);
6342     andl(rax, rdx);
6343   }
6344   sarl(rax, 31);
6345   andl(rax, divisor);
6346   subl(rdx, rax);
6347   bind(done);
6348 }
6349 
6350 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6351   Label done;
6352   Label neg_divisor_fastpath;
6353 
6354   cmpl(divisor, 0);
6355   jccb(Assembler::less, neg_divisor_fastpath);
6356   xorl(rdx, rdx);
6357   divl(divisor);
6358   jmpb(done);
6359   bind(neg_divisor_fastpath);
6360   // Fastpath for divisor < 0:
6361   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6362   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6363   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6364   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6365   movl(rdx, rax);
6366   subl(rax, divisor);
6367   if (VM_Version::supports_bmi1()) {
6368     andnl(rax, rax, rdx);
6369   } else {
6370     notl(rax);
6371     andl(rax, rdx);
6372   }
6373   movl(tmp, rax);
6374   shrl(rax, 31); // quotient
6375   sarl(tmp, 31);
6376   andl(tmp, divisor);
6377   subl(rdx, tmp); // remainder
6378   bind(done);
6379 }
6380 
6381 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6382                                  XMMRegister xtmp2, Register rtmp) {
6383   if(VM_Version::supports_gfni()) {
6384     // Galois field instruction based bit reversal based on following algorithm.
6385     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6386     mov64(rtmp, 0x8040201008040201L);
6387     movq(xtmp1, src);
6388     movq(xtmp2, rtmp);
6389     gf2p8affineqb(xtmp1, xtmp2, 0);
6390     movq(dst, xtmp1);
6391   } else {
6392     // Swap even and odd numbered bits.
6393     movl(rtmp, src);
6394     andl(rtmp, 0x55555555);
6395     shll(rtmp, 1);
6396     movl(dst, src);
6397     andl(dst, 0xAAAAAAAA);
6398     shrl(dst, 1);
6399     orl(dst, rtmp);
6400 
6401     // Swap LSB and MSB 2 bits of each nibble.
6402     movl(rtmp, dst);
6403     andl(rtmp, 0x33333333);
6404     shll(rtmp, 2);
6405     andl(dst, 0xCCCCCCCC);
6406     shrl(dst, 2);
6407     orl(dst, rtmp);
6408 
6409     // Swap LSB and MSB 4 bits of each byte.
6410     movl(rtmp, dst);
6411     andl(rtmp, 0x0F0F0F0F);
6412     shll(rtmp, 4);
6413     andl(dst, 0xF0F0F0F0);
6414     shrl(dst, 4);
6415     orl(dst, rtmp);
6416   }
6417   bswapl(dst);
6418 }
6419 
6420 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6421                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
6422   if(VM_Version::supports_gfni()) {
6423     // Galois field instruction based bit reversal based on following algorithm.
6424     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6425     mov64(rtmp1, 0x8040201008040201L);
6426     movq(xtmp1, src);
6427     movq(xtmp2, rtmp1);
6428     gf2p8affineqb(xtmp1, xtmp2, 0);
6429     movq(dst, xtmp1);
6430   } else {
6431     // Swap even and odd numbered bits.
6432     movq(rtmp1, src);
6433     mov64(rtmp2, 0x5555555555555555L);
6434     andq(rtmp1, rtmp2);
6435     shlq(rtmp1, 1);
6436     movq(dst, src);
6437     notq(rtmp2);
6438     andq(dst, rtmp2);
6439     shrq(dst, 1);
6440     orq(dst, rtmp1);
6441 
6442     // Swap LSB and MSB 2 bits of each nibble.
6443     movq(rtmp1, dst);
6444     mov64(rtmp2, 0x3333333333333333L);
6445     andq(rtmp1, rtmp2);
6446     shlq(rtmp1, 2);
6447     notq(rtmp2);
6448     andq(dst, rtmp2);
6449     shrq(dst, 2);
6450     orq(dst, rtmp1);
6451 
6452     // Swap LSB and MSB 4 bits of each byte.
6453     movq(rtmp1, dst);
6454     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6455     andq(rtmp1, rtmp2);
6456     shlq(rtmp1, 4);
6457     notq(rtmp2);
6458     andq(dst, rtmp2);
6459     shrq(dst, 4);
6460     orq(dst, rtmp1);
6461   }
6462   bswapq(dst);
6463 }
6464 
6465 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6466   Label done;
6467   Label neg_divisor_fastpath;
6468   cmpq(divisor, 0);
6469   jccb(Assembler::less, neg_divisor_fastpath);
6470   xorl(rdx, rdx);
6471   divq(divisor);
6472   jmpb(done);
6473   bind(neg_divisor_fastpath);
6474   // Fastpath for divisor < 0:
6475   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6476   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6477   movq(rdx, rax);
6478   subq(rdx, divisor);
6479   if (VM_Version::supports_bmi1()) {
6480     andnq(rax, rdx, rax);
6481   } else {
6482     notq(rdx);
6483     andq(rax, rdx);
6484   }
6485   shrq(rax, 63);
6486   bind(done);
6487 }
6488 
6489 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6490   Label done;
6491   Label neg_divisor_fastpath;
6492   cmpq(divisor, 0);
6493   jccb(Assembler::less, neg_divisor_fastpath);
6494   xorq(rdx, rdx);
6495   divq(divisor);
6496   jmp(done);
6497   bind(neg_divisor_fastpath);
6498   // Fastpath when divisor < 0:
6499   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6500   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6501   movq(rdx, rax);
6502   subq(rax, divisor);
6503   if (VM_Version::supports_bmi1()) {
6504     andnq(rax, rax, rdx);
6505   } else {
6506     notq(rax);
6507     andq(rax, rdx);
6508   }
6509   sarq(rax, 63);
6510   andq(rax, divisor);
6511   subq(rdx, rax);
6512   bind(done);
6513 }
6514 
6515 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6516   Label done;
6517   Label neg_divisor_fastpath;
6518   cmpq(divisor, 0);
6519   jccb(Assembler::less, neg_divisor_fastpath);
6520   xorq(rdx, rdx);
6521   divq(divisor);
6522   jmp(done);
6523   bind(neg_divisor_fastpath);
6524   // Fastpath for divisor < 0:
6525   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6526   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6527   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6528   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6529   movq(rdx, rax);
6530   subq(rax, divisor);
6531   if (VM_Version::supports_bmi1()) {
6532     andnq(rax, rax, rdx);
6533   } else {
6534     notq(rax);
6535     andq(rax, rdx);
6536   }
6537   movq(tmp, rax);
6538   shrq(rax, 63); // quotient
6539   sarq(tmp, 63);
6540   andq(tmp, divisor);
6541   subq(rdx, tmp); // remainder
6542   bind(done);
6543 }
6544 
6545 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6546                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6547                                         int vlen_enc) {
6548   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using the
  // lower 4 bits of each shuffle lane, thus all shuffle indices are normalized
  // to the index range 0-15. This ensures that shuffle indices which differ by
  // a multiple of 16 land at the same relative position within a 128 bit lane,
  // e.g. shuffle indices 16, 32 and 48 all select the first element of their
  // respective 128 bit source lanes.
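  // Worked example (illustrative): shuffle index 37 (0x25) satisfies 32 <= 37 < 48,
  // so it is handled by the pass that broadcasts the third 128 bit source lane, and
  // its low 4 bits (5) select byte 5 of that lane, i.e. source element 32 + 5 = 37.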
6555   movl(rtmp, 16);
6556   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6557 
6558   // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
6559   // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
6560   // original shuffle indices and move the shuffled lanes corresponding to true
6561   // mask to destination vector.
6562   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6563   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6564   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6565 
6566   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6567   // and broadcasting second 128 bit lane.
6568   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6569   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6570   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6571   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6572   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6573 
6574   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6575   // and broadcasting third 128 bit lane.
6576   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6577   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6578   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6579   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6580   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6581 
  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting the fourth 128 bit lane.
6584   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6585   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6586   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6587   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6588   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6589 }
6590 
6591 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6592                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6593   if (vlen_enc == AVX_128bit) {
6594     vpermilps(dst, src, shuffle, vlen_enc);
6595   } else if (bt == T_INT) {
6596     vpermd(dst, shuffle, src, vlen_enc);
6597   } else {
6598     assert(bt == T_FLOAT, "");
6599     vpermps(dst, shuffle, src, vlen_enc);
6600   }
6601 }
6602 
6603 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6604   switch(opcode) {
6605     case Op_AddHF: vaddsh(dst, src1, src2); break;
6606     case Op_SubHF: vsubsh(dst, src1, src2); break;
6607     case Op_MulHF: vmulsh(dst, src1, src2); break;
6608     case Op_DivHF: vdivsh(dst, src1, src2); break;
6609     default: assert(false, "%s", NodeClassNames[opcode]); break;
6610   }
6611 }
6612 
6613 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6614   switch(elem_bt) {
6615     case T_BYTE:
6616       if (ideal_opc == Op_SaturatingAddV) {
6617         vpaddsb(dst, src1, src2, vlen_enc);
6618       } else {
6619         assert(ideal_opc == Op_SaturatingSubV, "");
6620         vpsubsb(dst, src1, src2, vlen_enc);
6621       }
6622       break;
6623     case T_SHORT:
6624       if (ideal_opc == Op_SaturatingAddV) {
6625         vpaddsw(dst, src1, src2, vlen_enc);
6626       } else {
6627         assert(ideal_opc == Op_SaturatingSubV, "");
6628         vpsubsw(dst, src1, src2, vlen_enc);
6629       }
6630       break;
6631     default:
6632       fatal("Unsupported type %s", type2name(elem_bt));
6633       break;
6634   }
6635 }
6636 
6637 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6638   switch(elem_bt) {
6639     case T_BYTE:
6640       if (ideal_opc == Op_SaturatingAddV) {
6641         vpaddusb(dst, src1, src2, vlen_enc);
6642       } else {
6643         assert(ideal_opc == Op_SaturatingSubV, "");
6644         vpsubusb(dst, src1, src2, vlen_enc);
6645       }
6646       break;
6647     case T_SHORT:
6648       if (ideal_opc == Op_SaturatingAddV) {
6649         vpaddusw(dst, src1, src2, vlen_enc);
6650       } else {
6651         assert(ideal_opc == Op_SaturatingSubV, "");
6652         vpsubusw(dst, src1, src2, vlen_enc);
6653       }
6654       break;
6655     default:
6656       fatal("Unsupported type %s", type2name(elem_bt));
6657       break;
6658   }
6659 }
6660 
6661 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6662                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when the second input is greater than the first input.
6664   // overflow_mask = Inp1 <u Inp2
6665   evpcmpu(elem_bt, ktmp,  src2, src1, Assembler::lt, vlen_enc);
6666   // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6667   evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6668 }
6669 
6670 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6671                                                               XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6672   // Emulate unsigned comparison using signed comparison
6673   // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
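  // Worked example (illustrative, 32 bit lanes): 1 <u 0x80000000 holds; after adding
  // MIN_VALUE to both sides the signed compare sees 0x80000001 (-2147483647) < 0,
  // which is also true, so the biased signed compare reproduces the unsigned result.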
6674   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6675   vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6676   vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6677 
6678   vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6679 
6680   // Res = INP1 - INP2 (non-commutative and non-associative)
6681   vpsub(elem_bt, dst, src1, src2, vlen_enc);
6682   // Res = Mask ? Zero : Res
6683   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6684   vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6685 }
6686 
6687 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6688                                                                XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned value ranges comprise only non-negative numbers, thus only an upper bound saturation exists.
6690   // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6691   // Res = Signed Add INP1, INP2
6692   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6693   // T1 = SRC1 | SRC2
6694   vpor(xtmp1, src1, src2, vlen_enc);
6695   // Max_Unsigned = -1
6696   vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6697   // Unsigned compare:  Mask = Res <u T1
6698   evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6699   // res  = Mask ? Max_Unsigned : Res
6700   evpblend(elem_bt, dst, ktmp,  dst, xtmp2, true, vlen_enc);
6701 }
6702 
6703 //
6704 // Section 2-13 Hacker's Delight list following overflow detection check for saturating
6705 // unsigned addition operation.
6706 //    overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
6707 //
6708 // We empirically determined its semantic equivalence to following reduced expression
6709 //    overflow_mask =  (a + b) <u (a | b)
6710 //
6711 // and also verified it though Alive2 solver.
6712 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6713 //
6714 
6715 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6716                                                               XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6717   // Res = Signed Add INP1, INP2
6718   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6719   // Compute T1 = INP1 | INP2
6720   vpor(xtmp3, src1, src2, vlen_enc);
  // xtmp2 = Minimum signed value (xtmp1 now holds all ones as a side effect).
6722   vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6723   // Convert T1 to signed value, T1 = T1 + MIN_VALUE
6724   vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
6725   // Convert Res to signed value, Res<s> = Res + MIN_VALUE
6726   vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<s> < T1<s> (i.e. Res <u T1)
6728   if (elem_bt == T_INT) {
6729     vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
6730   } else {
6731     assert(elem_bt == T_LONG, "");
6732     vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
6733   }
6734   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6735 }
6736 
6737 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6738                                       int vlen_enc, bool xtmp2_hold_M1) {
6739   if (VM_Version::supports_avx512dq()) {
6740     evpmovq2m(ktmp, src, vlen_enc);
6741   } else {
6742     assert(VM_Version::supports_evex(), "");
6743     if (!xtmp2_hold_M1) {
6744       vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6745     }
6746     evpsraq(xtmp1, src, 63, vlen_enc);
6747     evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6748   }
6749 }
6750 
6751 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6752                                       int vlen_enc, bool xtmp2_hold_M1) {
6753   if (VM_Version::supports_avx512dq()) {
6754     evpmovd2m(ktmp, src, vlen_enc);
6755   } else {
6756     assert(VM_Version::supports_evex(), "");
6757     if (!xtmp2_hold_M1) {
6758       vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6759     }
6760     vpsrad(xtmp1, src, 31, vlen_enc);
6761     Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6762   }
6763 }
6764 
6765 
6766 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6767   if (elem_bt == T_LONG) {
6768     if (VM_Version::supports_evex()) {
6769       evpsraq(dst, src, 63, vlen_enc);
6770     } else {
6771       vpsrad(dst, src, 31, vlen_enc);
6772       vpshufd(dst, dst, 0xF5, vlen_enc);
6773     }
6774   } else {
6775     assert(elem_bt == T_INT, "");
6776     vpsrad(dst, src, 31, vlen_enc);
6777   }
6778 }
6779 
6780 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6781   if (compute_allones) {
6782     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6783       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6784     } else {
6785       vpcmpeqq(allones, allones, allones, vlen_enc);
6786     }
6787   }
6788   if (elem_bt == T_LONG) {
6789     vpsrlq(dst, allones, 1, vlen_enc);
6790   } else {
6791     assert(elem_bt == T_INT, "");
6792     vpsrld(dst, allones, 1, vlen_enc);
6793   }
6794 }
6795 
6796 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6797   if (compute_allones) {
6798     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6799       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6800     } else {
6801       vpcmpeqq(allones, allones, allones, vlen_enc);
6802     }
6803   }
6804   if (elem_bt == T_LONG) {
6805     vpsllq(dst, allones, 63, vlen_enc);
6806   } else {
6807     assert(elem_bt == T_INT, "");
6808     vpslld(dst, allones, 31, vlen_enc);
6809   }
6810 }
6811 
6812 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
6813                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6814   switch(elem_bt) {
6815     case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6816     case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6817     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6818     case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6819     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6820   }
6821 }
6822 
6823 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6824   switch(elem_bt) {
6825     case  T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
6826     case  T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
6827     case  T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6828     case  T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
6829     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6830   }
6831 }
6832 
6833 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6834                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6835   if (elem_bt == T_LONG) {
6836     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6837   } else {
6838     assert(elem_bt == T_INT, "");
6839     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6840   }
6841 }
6842 
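     // Saturating add/sub for int/long lanes. x86 has no native saturating
     // instructions for these widths, so overflow is detected explicitly and the
     // affected lanes are clamped to MIN/MAX. A scalar sketch of the logic emitted
     // below, assuming wrapping two's complement arithmetic (illustrative only, the
     // names are hypothetical and not part of the emitted code):
     //
     //   long res = a + b;                                      // or a - b
     //   bool ovf = (((res ^ a) & (res ^ b)) >> 63) != 0;       // add: inputs agree, result differs
     //   // for sub: ovf = (((a ^ b) & (res ^ a)) >> 63) != 0;  // inputs differ, result differs from a
     //   long sat = (a < 0) ? LONG_MIN : LONG_MAX;              // saturate toward a's sign
     //   return ovf ? sat : res;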
6843 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6844                                                          XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6845                                                          KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6846   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6847   // Addition/subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6848   // Overflow detection is based on Hacker's Delight, section 2-13.
6849   if (ideal_opc == Op_SaturatingAddV) {
6850     // res = src1 + src2
6851     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6852     // Overflow occurs when both inputs have the same sign and the result's sign differs from it.
6853     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6854     vpxor(xtmp1, dst, src1, vlen_enc);
6855     vpxor(xtmp2, dst, src2, vlen_enc);
6856     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6857   } else {
6858     assert(ideal_opc == Op_SaturatingSubV, "");
6859     // res = src1 - src2
6860     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6861     // Overflow occurs when the inputs have opposite signs and
6862     // the result's sign differs from the first input's sign.
6863     // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
6864     vpxor(xtmp1, src1, src2, vlen_enc);
6865     vpxor(xtmp2, dst, src1, vlen_enc);
6866     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6867   }
6868 
6869   // Compute overflow detection mask.
6870   evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
6871   // Note: xtmp1 holds -1 in all its lanes after the above call.
6872 
6873   // Compute mask based on first input polarity.
6874   evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
6875 
6876   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6877   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6878 
6879   // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
6880   // set bits in the first input polarity mask hold the MIN value.
6881   evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6882   // Blend destination lanes with saturated values using overflow detection mask.
6883   evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6884 }
6885 
6886 
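     // Same algorithm as the EVEX variant above, but without opmask registers: the
     // overflow and first-input polarity masks are materialized as sign-extended
     // vectors (vpsign_extend_dq) and the blends are done with vpblendvb.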
6887 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6888                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6889                                                         XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
6890   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6891   // Addition/subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6892   // Overflow detection is based on Hacker's Delight, section 2-13.
6893   if (ideal_opc == Op_SaturatingAddV) {
6894     // res = src1 + src2
6895     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6896     // Overflow occurs when both inputs have the same sign and the result's sign differs from it.
6897     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6898     vpxor(xtmp1, dst, src1, vlen_enc);
6899     vpxor(xtmp2, dst, src2, vlen_enc);
6900     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6901   } else {
6902     assert(ideal_opc == Op_SaturatingSubV, "");
6903     // res = src1 - src2
6904     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6905     // Overflow occurs when the inputs have opposite signs and
6906     // the result's sign differs from the first input's sign.
6907     // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
6908     vpxor(xtmp1, src1, src2, vlen_enc);
6909     vpxor(xtmp2, dst, src1, vlen_enc);
6910     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6911   }
6912 
6913   // Sign-extend to compute overflow detection mask.
6914   vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
6915 
6916   vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
6917   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
6918   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6919 
6920   // Compose saturating min/max vector using first input polarity mask.
6921   vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
6922   vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
6923 
6924   // Blend result with saturating vector using overflow detection mask.
6925   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6926 }
6927 
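     // Byte and short lanes have native saturating add/sub instructions
     // (vpadds*/vpsubs*, and vpaddus*/vpsubus* for the unsigned variants below);
     // int and long lanes are handled by the vector_addsub_dq_saturating_* helpers above.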
6928 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6929   switch(elem_bt) {
6930     case T_BYTE:
6931       if (ideal_opc == Op_SaturatingAddV) {
6932         vpaddsb(dst, src1, src2, vlen_enc);
6933       } else {
6934         assert(ideal_opc == Op_SaturatingSubV, "");
6935         vpsubsb(dst, src1, src2, vlen_enc);
6936       }
6937       break;
6938     case T_SHORT:
6939       if (ideal_opc == Op_SaturatingAddV) {
6940         vpaddsw(dst, src1, src2, vlen_enc);
6941       } else {
6942         assert(ideal_opc == Op_SaturatingSubV, "");
6943         vpsubsw(dst, src1, src2, vlen_enc);
6944       }
6945       break;
6946     default:
6947       fatal("Unsupported type %s", type2name(elem_bt));
6948       break;
6949   }
6950 }
6951 
6952 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6953   switch(elem_bt) {
6954     case T_BYTE:
6955       if (ideal_opc == Op_SaturatingAddV) {
6956         vpaddusb(dst, src1, src2, vlen_enc);
6957       } else {
6958         assert(ideal_opc == Op_SaturatingSubV, "");
6959         vpsubusb(dst, src1, src2, vlen_enc);
6960       }
6961       break;
6962     case T_SHORT:
6963       if (ideal_opc == Op_SaturatingAddV) {
6964         vpaddusw(dst, src1, src2, vlen_enc);
6965       } else {
6966         assert(ideal_opc == Op_SaturatingSubV, "");
6967         vpsubusw(dst, src1, src2, vlen_enc);
6968       }
6969       break;
6970     default:
6971       fatal("Unsupported type %s", type2name(elem_bt));
6972       break;
6973   }
6974 }
6975 
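     // evpermi2* uses dst as the index vector: each result element is selected from
     // the two-register table formed by src1 and src2 according to the corresponding
     // index in dst, and the selected elements overwrite the indices in dst.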
6976 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6977                                                      XMMRegister src2, int vlen_enc) {
6978   switch(elem_bt) {
6979     case T_BYTE:
6980       evpermi2b(dst, src1, src2, vlen_enc);
6981       break;
6982     case T_SHORT:
6983       evpermi2w(dst, src1, src2, vlen_enc);
6984       break;
6985     case T_INT:
6986       evpermi2d(dst, src1, src2, vlen_enc);
6987       break;
6988     case T_LONG:
6989       evpermi2q(dst, src1, src2, vlen_enc);
6990       break;
6991     case T_FLOAT:
6992       evpermi2ps(dst, src1, src2, vlen_enc);
6993       break;
6994     case T_DOUBLE:
6995       evpermi2pd(dst, src1, src2, vlen_enc);
6996       break;
6997     default:
6998       fatal("Unsupported type %s", type2name(elem_bt));
6999       break;
7000   }
7001 }
7002 
7003 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
7004   if (is_unsigned) {
7005     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7006   } else {
7007     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7008   }
7009 }
7010 
7011 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
7012   if (is_unsigned) {
7013     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7014   } else {
7015     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7016   }
7017 }
7018 
7019 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
7020   switch(opcode) {
7021     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7022     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7023     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7024     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7025     default: assert(false, "%s", NodeClassNames[opcode]); break;
7026   }
7027 }
7028 
7029 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7030   switch(opcode) {
7031     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7032     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7033     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7034     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7035     default: assert(false, "%s", NodeClassNames[opcode]); break;
7036   }
7037 }
7038 
7039 void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7040                                             KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
7041   vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
7042 }
7043 
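     // Java max/min semantics differ from vmaxph/vminph: the Java result must be NaN
     // if either input is NaN, and -0.0 is treated as strictly smaller than +0.0,
     // whereas the instruction simply returns its second source operand both when the
     // inputs are equal-valued zeros and when exactly one input is a NaN. The
     // sign-based operand swap and the trailing NaN blend below restore Java behaviour.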
7044 void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7045                                             KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
7046   if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
7047     // Move sign bits of src2 to mask register.
7048     evpmovw2m(ktmp, src2, vlen_enc);
7049     // xtmp1 = src2 < 0 ? src2 : src1
7050     evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7051     // xtmp2 = src2 < 0 ? src1 : src2
7052     evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
7053     // The idea behind the above swap is to make the second source operand a +ve value.
7054     // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
7055     // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
7056     // the second source operand, either a NaN or a valid floating-point value, is returned.
7057     // dst = max(xtmp1, xtmp2)
7058     evmaxph(dst, xtmp1, xtmp2, vlen_enc);
7059     // isNaN = is_unordered_quiet(xtmp1)
7060     evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
7061     // The final result is the same as the first source if it is a NaN;
7062     // if the second operand holds a NaN then, as per the semantics above,
7063     // the result is the same as the second operand.
7064     Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7065   } else {
7066     assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
7067     // Move sign bits of src1 to mask register.
7068     evpmovw2m(ktmp, src1, vlen_enc);
7069     // xtmp1 = src1 < 0 ? src2 : src1
7070     evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7071     // xtmp2 = src1 < 0 ? src1 : src2
7072     evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
7073     // The idea behind the above swap is to make the second source operand a -ve value.
7074     // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
7075     // the second source operand is returned.
7076     // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
7077     // or a valid floating-point value, is written to the result.
7078     // dst = min(xtmp1, xtmp2)
7079     evminph(dst, xtmp1, xtmp2, vlen_enc);
7080     // isNaN = is_unordered_quiet(xtmp1)
7081     evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
7082     // The final result is the same as the first source if it is a NaN;
7083     // if the second operand holds a NaN then, as per the semantics above,
7084     // the result is the same as the second operand.
7085     Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7086   }
7087 }