1 /*
   2  * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "gc/shared/barrierSet.hpp"
  28 #include "gc/shared/barrierSetAssembler.hpp"
  29 #include "oops/methodData.hpp"
  30 #include "opto/c2_MacroAssembler.hpp"
  31 #include "opto/intrinsicnode.hpp"
  32 #include "opto/opcodes.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/subnode.hpp"
  35 #include "runtime/globals.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 #include "utilities/checkedCast.hpp"
  39 #include "utilities/globalDefinitions.hpp"
  40 #include "utilities/powerOfTwo.hpp"
  41 #include "utilities/sizes.hpp"
  42 
  43 #ifdef PRODUCT
  44 #define BLOCK_COMMENT(str) /* nothing */
  45 #define STOP(error) stop(error)
  46 #else
  47 #define BLOCK_COMMENT(str) block_comment(str)
  48 #define STOP(error) block_comment(error); stop(error)
  49 #endif
  50 
  51 // C2 compiled method's prolog code.
  52 void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  53   if (C->clinit_barrier_on_entry()) {
  54     assert(VM_Version::supports_fast_class_init_checks(), "sanity");
  55     assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");
  56 
  57     Label L_skip_barrier;
  58     Register klass = rscratch1;
  59 
  60     mov_metadata(klass, C->method()->holder()->constant_encoding());
  61     clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
  62 
  63     jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
  64 
  65     bind(L_skip_barrier);
  66   }
  67 
  68   int framesize = C->output()->frame_size_in_bytes();
  69   int bangsize = C->output()->bang_size_in_bytes();
  70   bool fp_mode_24b = false;
  71   int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;
  72 
  73   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  74 
  75   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  76   // Remove word for return addr
  77   framesize -= wordSize;
  78   stack_bang_size -= wordSize;
  79 
  80   // Calls to C2R adapters often do not accept exceptional returns.
  81   // We require their callers to bang the stack for them.  But be careful, because
  82   // some VM calls (such as call site linkage) can use several kilobytes of
  83   // stack.  The stack safety zone should account for that.
  84   // See bugs 4446381, 4468289, 4497237.
  85   if (stack_bang_size > 0) {
  86     generate_stack_overflow_check(stack_bang_size);
  87 
  88     // We always push rbp so that, on return to the interpreter, rbp will be
  89     // restored correctly and we can correct the stack.
  90     push(rbp);
  91     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  92     if (PreserveFramePointer) {
  93       mov(rbp, rsp);
  94     }
  95     // Remove word for ebp
  96     framesize -= wordSize;
  97 
  98     // Create frame
  99     if (framesize) {
 100       subptr(rsp, framesize);
 101     }
 102   } else {
 103     subptr(rsp, framesize);
 104 
 105     // Save RBP register now.
 106     framesize -= wordSize;
 107     movptr(Address(rsp, framesize), rbp);
 108     // Save caller's stack pointer into RBP if the frame pointer is preserved.
 109     if (PreserveFramePointer) {
 110       movptr(rbp, rsp);
 111       if (framesize > 0) {
 112         addptr(rbp, framesize);
 113       }
 114     }
 115   }
 116 
 117   if (C->needs_stack_repair()) {
 118     // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
 119     assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
 120     movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize);
 121   }
 122 
 123   if (VerifyStackAtCalls) { // Magic cookie to verify stack depth
 124     framesize -= wordSize;
 125     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 126   }
 127 
 128 #ifdef ASSERT
 129   if (VerifyStackAtCalls) {
 130     Label L;
 131     push(rax);
 132     mov(rax, rsp);
 133     andptr(rax, StackAlignmentInBytes-1);
 134     cmpptr(rax, StackAlignmentInBytes-wordSize);
 135     pop(rax);
 136     jcc(Assembler::equal, L);
 137     STOP("Stack is not properly aligned!");
 138     bind(L);
 139   }
 140 #endif
 141 }
 142 
 143 void C2_MacroAssembler::entry_barrier() {
 144   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 145   // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 146   Label dummy_slow_path;
 147   Label dummy_continuation;
 148   Label* slow_path = &dummy_slow_path;
 149   Label* continuation = &dummy_continuation;
 150   if (!Compile::current()->output()->in_scratch_emit_size()) {
 151     // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 152     C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 153     Compile::current()->output()->add_stub(stub);
 154     slow_path = &stub->entry();
 155     continuation = &stub->continuation();
 156   }
 157   bs->nmethod_entry_barrier(this, slow_path, continuation);
 158 }
 159 
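     // Map a vector length in bytes to the AVX length encoding used by the assembler.
     // Vectors of 4 or 8 bytes are emitted as 128-bit (XMM) operations.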
 160 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 161   switch (vlen_in_bytes) {
 162     case  4: // fall-through
 163     case  8: // fall-through
 164     case 16: return Assembler::AVX_128bit;
 165     case 32: return Assembler::AVX_256bit;
 166     case 64: return Assembler::AVX_512bit;
 167 
 168     default: {
 169       ShouldNotReachHere();
 170       return Assembler::AVX_NoVec;
 171     }
 172   }
 173 }
 174 
 175 // fast_lock and fast_unlock used by C2
 176 
 177 // Because the transitions from emitted code to the runtime
 178 // monitorenter/exit helper stubs are so slow it's critical that
 179 // we inline both the stack-locking fast path and the inflated fast path.
 180 //
 181 // See also: cmpFastLock and cmpFastUnlock.
 182 //
 183 // What follows is a specialized inline transliteration of the code
 184 // in enter() and exit(). If we're concerned about I$ bloat, another
 185 // option would be to emit TrySlowEnter and TrySlowExit methods
 186 // at startup-time.  These methods would accept arguments as
 187 // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 188 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 189 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 190 // In practice, however, the # of lock sites is bounded and is usually small.
 191 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
 192 // if the processor uses simple bimodal branch predictors keyed by EIP,
 193 // since the helper routines would be called from multiple synchronization
 194 // sites.
 195 //
 196 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 197 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 198 // to those specialized methods.  That'd give us a mostly platform-independent
 199 // implementation that the JITs could optimize and inline at their pleasure.
 200 // Done correctly, the only time we'd need to cross to native code would be
 201 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 202 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 203 // (b) issue explicit barriers or fence operations.
 204 //
 205 // TODO:
 206 //
 207 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 208 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 209 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 210 //    the lock operators would typically be faster than reifying Self.
 211 //
 212 // *  Ideally I'd define the primitives as:
 213 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 214 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 215 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 216 //    Instead, we're stuck with the rather awkward and brittle register assignments below.
 217 //    Furthermore the register assignments are overconstrained, possibly resulting in
 218 //    sub-optimal code near the synchronization site.
 219 //
 220 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 221 //    Alternately, use a better sp-proximity test.
 222 //
 223 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 224 //    Either one is sufficient to uniquely identify a thread.
 225 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 226 //
 227 // *  Intrinsify notify() and notifyAll() for the common cases where the
 228 //    object is locked by the calling thread but the waitlist is empty.
 229 //    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 230 //
 231 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 232 //    But beware of excessive branch density on AMD Opterons.
 233 //
 234 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 235 //    or failure of the fast path.  If the fast path fails then we pass
 236 //    control to the slow path, typically in C.  In fast_lock and
 237 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 238 //    will emit a conditional branch immediately after the node.
 239 //    So we have branches to branches and lots of ICC.ZF games.
 240 //    Instead, it might be better to have C2 pass a "FailureLabel"
 241 //    into fast_lock and fast_unlock.  In the case of success, control
 242 //    will drop through the node.  ICC.ZF is undefined at exit.
 243 //    In the case of failure, the node will branch directly to the
 244 //    FailureLabel.
 245 
 246 
 247 // obj: object to lock
 248 // box: on-stack box address -- KILLED
 249 // rax: tmp -- KILLED
 250 // t  : tmp -- KILLED
 251 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 252                                               Register t, Register thread) {
 253   assert(rax_reg == rax, "Used for CAS");
 254   assert_different_registers(obj, box, rax_reg, t, thread);
 255 
 256   // Handle inflated monitor.
 257   Label inflated;
 258   // Finish fast lock successfully. ZF value is irrelevant.
 259   Label locked;
 260   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 261   Label slow_path;
 262 
 263   if (UseObjectMonitorTable) {
 264     // Clear cache in case fast locking succeeds or we need to take the slow-path.
 265     movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
 266   }
 267 
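       // When diagnosing synchronization on value-based classes, force such objects
       // down the slow path so the runtime can log or fail according to the flag.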
 268   if (DiagnoseSyncOnValueBasedClasses != 0) {
 269     load_klass(rax_reg, obj, t);
 270     testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 271     jcc(Assembler::notZero, slow_path);
 272   }
 273 
 274   const Register mark = t;
 275 
 276   { // Lightweight Lock
 277 
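         // Fast path: try to push obj onto the per-thread lock-stack, CASing the mark
         // word from unlocked (0b01) to fast-locked (0b00) unless this thread already
         // holds the lock recursively.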
 278     Label push;
 279 
 280     const Register top = UseObjectMonitorTable ? rax_reg : box;
 281 
 282     // Load the mark.
 283     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 284 
 285     // Prefetch top.
 286     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 287 
 288     // Check for monitor (0b10).
 289     testptr(mark, markWord::monitor_value);
 290     jcc(Assembler::notZero, inflated);
 291 
 292     // Check if lock-stack is full.
 293     cmpl(top, LockStack::end_offset() - 1);
 294     jcc(Assembler::greater, slow_path);
 295 
 296     // Check if recursive.
 297     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 298     jccb(Assembler::equal, push);
 299 
 300     // Try to lock. Transition lock bits 0b01 => 0b00
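         // rax_reg holds the expected (unlocked) mark word and mark holds the desired
         // (locked) mark word for the CAS below.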
 301     movptr(rax_reg, mark);
 302     orptr(rax_reg, markWord::unlocked_value);
 303     andptr(mark, ~(int32_t)markWord::unlocked_value);
 304     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 305     jcc(Assembler::notEqual, slow_path);
 306 
 307     if (UseObjectMonitorTable) {
 308       // Need to reload top, clobbered by CAS.
 309       movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 310     }
 311     bind(push);
 312     // After successful lock, push object on lock-stack.
 313     movptr(Address(thread, top), obj);
 314     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 315     jmpb(locked);
 316   }
 317 
 318   { // Handle inflated monitor.
 319     bind(inflated);
 320 
 321     const Register monitor = t;
 322 
 323     if (!UseObjectMonitorTable) {
 324       assert(mark == monitor, "should be the same here");
 325     } else {
 326       // Uses ObjectMonitorTable.  Look for the monitor in the om_cache.
 327       // Fetch ObjectMonitor* from the cache or take the slow-path.
 328       Label monitor_found;
 329 
 330       // Load cache address
 331       lea(t, Address(thread, JavaThread::om_cache_oops_offset()));
 332 
 333       const int num_unrolled = 2;
 334       for (int i = 0; i < num_unrolled; i++) {
 335         cmpptr(obj, Address(t));
 336         jccb(Assembler::equal, monitor_found);
 337         increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 338       }
 339 
 340       Label loop;
 341 
 342       // Search for obj in cache.
 343       bind(loop);
 344 
 345       // Check for match.
 346       cmpptr(obj, Address(t));
 347       jccb(Assembler::equal, monitor_found);
 348 
 349       // Search until null encountered, guaranteed _null_sentinel at end.
 350       cmpptr(Address(t), 1);
 351       jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
 352       increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 353       jmpb(loop);
 354 
 355       // Cache hit.
 356       bind(monitor_found);
 357       movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
 358     }
 359     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 360     const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 361     const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);
 362 
 363     Label monitor_locked;
 364     // Lock the monitor.
 365 
 366     if (UseObjectMonitorTable) {
 367       // Cache the monitor for unlock before trashing box. On failure to acquire
 368       // the lock, the slow path will reset the entry accordingly (see CacheSetter).
 369       movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
 370     }
 371 
 372     // Try to CAS owner (no owner => current thread's _monitor_owner_id).
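         // rax_reg = expected owner (null), box = this thread's _monitor_owner_id,
         // i.e. the new owner on success.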
 373     xorptr(rax_reg, rax_reg);
 374     movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
 375     lock(); cmpxchgptr(box, owner_address);
 376     jccb(Assembler::equal, monitor_locked);
 377 
 378     // Check if recursive.
 379     cmpptr(box, rax_reg);
 380     jccb(Assembler::notEqual, slow_path);
 381 
 382     // Recursive.
 383     increment(recursions_address);
 384 
 385     bind(monitor_locked);
 386   }
 387 
 388   bind(locked);
 389   // Set ZF = 1
 390   xorl(rax_reg, rax_reg);
 391 
 392 #ifdef ASSERT
 393   // Check that locked label is reached with ZF set.
 394   Label zf_correct;
 395   Label zf_bad_zero;
 396   jcc(Assembler::zero, zf_correct);
 397   jmp(zf_bad_zero);
 398 #endif
 399 
 400   bind(slow_path);
 401 #ifdef ASSERT
 402   // Check that slow_path label is reached with ZF not set.
 403   jcc(Assembler::notZero, zf_correct);
 404   stop("Fast Lock ZF != 0");
 405   bind(zf_bad_zero);
 406   stop("Fast Lock ZF != 1");
 407   bind(zf_correct);
 408 #endif
 409   // C2 uses the value of ZF to determine the continuation.
 410 }
 411 
 412 // obj: object to unlock
 413 // rax: tmp -- KILLED
 414 // t  : tmp - cannot be obj or rax -- KILLED
 415 //
 416 // Some commentary on balanced locking:
 417 //
 418 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 419 // Methods that don't have provably balanced locking are forced to run in the
 420 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 421 // The interpreter provides two properties:
 422 // I1:  At return-time the interpreter automatically and quietly unlocks any
 423 //      objects acquired in the current activation (frame).  Recall that the
 424 //      interpreter maintains an on-stack list of locks currently held by
 425 //      a frame.
 426 // I2:  If a method attempts to unlock an object that is not held by the
 427 //      frame the interpreter throws IMSX.
 428 //
 429 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
 430 // B() doesn't have provably balanced locking so it runs in the interpreter.
 431 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 432 // is still locked by A().
 433 //
 434 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface
 435 // Specification" states that an object locked by JNI's MonitorEnter should not be
 436 // unlocked by "normal" java-level locking and vice-versa.  The specification doesn't
 437 // specify what will occur if a program engages in such mixed-mode locking, however.
 438 // Arguably, given that the spec legislates the JNI case as undefined, our implementation
 439 // could reasonably *avoid* checking the owner in fast_unlock().
 440 // In the interest of performance we elide the m->Owner==Self check in unlock.
 441 // A perfectly viable alternative is to elide the owner check except when
 442 // Xcheck:jni is enabled.
 443 
 444 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
 445   assert(reg_rax == rax, "Used for CAS");
 446   assert_different_registers(obj, reg_rax, t);
 447 
 448   // Handle inflated monitor.
 449   Label inflated, inflated_check_lock_stack;
 450   // Finish fast unlock successfully.  MUST jump with ZF == 1
 451   Label unlocked, slow_path;
 452 
 453   const Register mark = t;
 454   const Register monitor = t;
 455   const Register top = UseObjectMonitorTable ? t : reg_rax;
 456   const Register box = reg_rax;
 457 
 458   Label dummy;
 459   C2FastUnlockLightweightStub* stub = nullptr;
 460 
 461   if (!Compile::current()->output()->in_scratch_emit_size()) {
 462     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
 463     Compile::current()->output()->add_stub(stub);
 464   }
 465 
 466   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
 467 
 468   { // Lightweight Unlock
 469 
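         // Fast path: if obj is on top of the lock-stack, pop it and, unless the lock
         // is held recursively, CAS the mark word from fast-locked (0b00) back to
         // unlocked (0b01).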
 470     // Load top.
 471     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 472 
 473     if (!UseObjectMonitorTable) {
 474       // Prefetch mark.
 475       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 476     }
 477 
 478     // Check if obj is top of lock-stack.
 479     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 480     // Top of lock stack was not obj. Must be monitor.
 481     jcc(Assembler::notEqual, inflated_check_lock_stack);
 482 
 483     // Pop lock-stack.
 484     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
 485     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 486 
 487     // Check if recursive.
 488     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
 489     jcc(Assembler::equal, unlocked);
 490 
 491     // We elide the monitor check; let the CAS fail instead.
 492 
 493     if (UseObjectMonitorTable) {
 494       // Load mark.
 495       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 496     }
 497 
 498     // Try to unlock. Transition lock bits 0b00 => 0b01
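         // reg_rax = expected (locked) mark word, mark = desired (unlocked) mark word
         // for the CAS below.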
 499     movptr(reg_rax, mark);
 500     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
 501     orptr(mark, markWord::unlocked_value);
 502     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 503     jcc(Assembler::notEqual, push_and_slow_path);
 504     jmp(unlocked);
 505   }
 506 
 507 
 508   { // Handle inflated monitor.
 509     bind(inflated_check_lock_stack);
 510 #ifdef ASSERT
 511     Label check_done;
 512     subl(top, oopSize);
 513     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
 514     jcc(Assembler::below, check_done);
 515     cmpptr(obj, Address(thread, top));
 516     jccb(Assembler::notEqual, inflated_check_lock_stack);
 517     stop("Fast Unlock lock on stack");
 518     bind(check_done);
 519     if (UseObjectMonitorTable) {
 520       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 521     }
 522     testptr(mark, markWord::monitor_value);
 523     jccb(Assembler::notZero, inflated);
 524     stop("Fast Unlock not monitor");
 525 #endif
 526 
 527     bind(inflated);
 528 
 529     if (!UseObjectMonitorTable) {
 530       assert(mark == monitor, "should be the same here");
 531     } else {
 532       // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
 533       movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 534       // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
 535       cmpptr(monitor, alignof(ObjectMonitor*));
 536       jcc(Assembler::below, slow_path);
 537     }
 538     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 539     const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
 540     const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
 541     const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
 542     const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
 543 
 544     Label recursive;
 545 
 546     // Check if recursive.
 547     cmpptr(recursions_address, 0);
 548     jccb(Assembler::notZero, recursive);
 549 
 550     // Set owner to null.
 551     // Release to satisfy the JMM
 552     movptr(owner_address, NULL_WORD);
 553     // We need a full fence after clearing owner to avoid stranding.
 554     // StoreLoad achieves this.
 555     membar(StoreLoad);
 556 
 557     // Check if the entry_list is empty.
 558     cmpptr(entry_list_address, NULL_WORD);
 559     jccb(Assembler::zero, unlocked);    // If so we are done.
 560 
 561     // Check if there is a successor.
 562     cmpptr(succ_address, NULL_WORD);
 563     jccb(Assembler::notZero, unlocked); // If so we are done.
 564 
 565     // Save the monitor pointer in the current thread, so we can try to
 566     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 567     if (!UseObjectMonitorTable) {
 568       andptr(monitor, ~(int32_t)markWord::monitor_value);
 569     }
 570     movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);
 571 
 572     orl(t, 1); // Fast Unlock ZF = 0
 573     jmpb(slow_path);
 574 
 575     // Recursive unlock.
 576     bind(recursive);
 577     decrement(recursions_address);
 578   }
 579 
 580   bind(unlocked);
 581   xorl(t, t); // Fast Unlock ZF = 1
 582 
 583 #ifdef ASSERT
 584   // Check that unlocked label is reached with ZF set.
 585   Label zf_correct;
 586   Label zf_bad_zero;
 587   jcc(Assembler::zero, zf_correct);
 588   jmp(zf_bad_zero);
 589 #endif
 590 
 591   bind(slow_path);
 592   if (stub != nullptr) {
 593     bind(stub->slow_path_continuation());
 594   }
 595 #ifdef ASSERT
 596   // Check that the slow_path and stub->slow_path_continuation() labels are reached with ZF not set.
 597   jcc(Assembler::notZero, zf_correct);
 598   stop("Fast Unlock ZF != 0");
 599   bind(zf_bad_zero);
 600   stop("Fast Unlock ZF != 1");
 601   bind(zf_correct);
 602 #endif
 603   // C2 uses the value of ZF to determine the continuation.
 604 }
 605 
 606 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
 607   fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
 608 }
 609 
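     // Compute the frame pointer of the current C2 frame: rsp plus the fixed frame
     // size, minus the two slots occupied by the saved rbp and the return address.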
 610 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
 611   const int framesize = Compile::current()->output()->frame_size_in_bytes();
 612   masm->movptr(dst, rsp);
 613   if (framesize > 2 * wordSize) {
 614     masm->addptr(dst, framesize - 2 * wordSize);
 615   }
 616 }
 617 
 618 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
 619   if (PreserveFramePointer) {
 620     // frame pointer is valid
 621 #ifdef ASSERT
 622     // Verify frame pointer value in rbp.
 623     reconstruct_frame_pointer_helper(this, rtmp);
 624     Label L_success;
 625     cmpq(rbp, rtmp);
 626     jccb(Assembler::equal, L_success);
 627     STOP("frame pointer mismatch");
 628     bind(L_success);
 629 #endif // ASSERT
 630   } else {
 631     reconstruct_frame_pointer_helper(this, rbp);
 632   }
 633 }
 634 
 635 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
 636   jint lo = t->_lo;
 637   jint hi = t->_hi;
 638   assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
 639   if (t == TypeInt::INT) {
 640     return;
 641   }
 642 
 643   BLOCK_COMMENT("CastII {");
 644   Label fail;
 645   Label succeed;
 646   if (hi == max_jint) {
 647     cmpl(val, lo);
 648     jccb(Assembler::greaterEqual, succeed);
 649   } else {
 650     if (lo != min_jint) {
 651       cmpl(val, lo);
 652       jccb(Assembler::less, fail);
 653     }
 654     cmpl(val, hi);
 655     jccb(Assembler::lessEqual, succeed);
 656   }
 657 
 658   bind(fail);
 659   movl(c_rarg0, idx);
 660   movl(c_rarg1, val);
 661   movl(c_rarg2, lo);
 662   movl(c_rarg3, hi);
 663   reconstruct_frame_pointer(rscratch1);
 664   call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
 665   hlt();
 666   bind(succeed);
 667   BLOCK_COMMENT("} // CastII");
 668 }
 669 
 670 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
 671   fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
 672 }
 673 
 674 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
 675   jlong lo = t->_lo;
 676   jlong hi = t->_hi;
 677   assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
 678   if (t == TypeLong::LONG) {
 679     return;
 680   }
 681 
 682   BLOCK_COMMENT("CastLL {");
 683   Label fail;
 684   Label succeed;
 685 
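       // Compare val against a 64-bit bound: use an immediate when the bound fits in a
       // sign-extended 32-bit value, otherwise materialize the bound in tmp first.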
 686   auto cmp_val = [&](jlong bound) {
 687     if (is_simm32(bound)) {
 688       cmpq(val, checked_cast<int>(bound));
 689     } else {
 690       mov64(tmp, bound);
 691       cmpq(val, tmp);
 692     }
 693   };
 694 
 695   if (hi == max_jlong) {
 696     cmp_val(lo);
 697     jccb(Assembler::greaterEqual, succeed);
 698   } else {
 699     if (lo != min_jlong) {
 700       cmp_val(lo);
 701       jccb(Assembler::less, fail);
 702     }
 703     cmp_val(hi);
 704     jccb(Assembler::lessEqual, succeed);
 705   }
 706 
 707   bind(fail);
 708   movl(c_rarg0, idx);
 709   movq(c_rarg1, val);
 710   mov64(c_rarg2, lo);
 711   mov64(c_rarg3, hi);
 712   reconstruct_frame_pointer(rscratch1);
 713   call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
 714   hlt();
 715   bind(succeed);
 716   BLOCK_COMMENT("} // CastLL");
 717 }
 718 
 719 //-------------------------------------------------------------------------------------------
 720 // Generic instructions support for use in .ad files C2 code generation
 721 
 722 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 723   if (dst != src) {
 724     movdqu(dst, src);
 725   }
 726   if (opcode == Op_AbsVD) {
 727     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 728   } else {
 729     assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 730     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 731   }
 732 }
 733 
 734 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 735   if (opcode == Op_AbsVD) {
 736     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 737   } else {
 738     assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 739     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 740   }
 741 }
 742 
 743 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 744   if (dst != src) {
 745     movdqu(dst, src);
 746   }
 747   if (opcode == Op_AbsVF) {
 748     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 749   } else {
 750     assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
 751     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 752   }
 753 }
 754 
 755 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 756   if (opcode == Op_AbsVF) {
 757     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 758   } else {
 759     assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
 760     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 761   }
 762 }
 763 
 764 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 765   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 766   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 767 
 768   if (opcode == Op_MinV) {
 769     if (elem_bt == T_BYTE) {
 770       pminsb(dst, src);
 771     } else if (elem_bt == T_SHORT) {
 772       pminsw(dst, src);
 773     } else if (elem_bt == T_INT) {
 774       pminsd(dst, src);
 775     } else {
 776       assert(elem_bt == T_LONG, "required");
 777       assert(tmp == xmm0, "required");
 778       assert_different_registers(dst, src, tmp);
 779       movdqu(xmm0, dst);
 780       pcmpgtq(xmm0, src);
 781       blendvpd(dst, src);  // xmm0 as mask
 782     }
 783   } else { // opcode == Op_MaxV
 784     if (elem_bt == T_BYTE) {
 785       pmaxsb(dst, src);
 786     } else if (elem_bt == T_SHORT) {
 787       pmaxsw(dst, src);
 788     } else if (elem_bt == T_INT) {
 789       pmaxsd(dst, src);
 790     } else {
 791       assert(elem_bt == T_LONG, "required");
 792       assert(tmp == xmm0, "required");
 793       assert_different_registers(dst, src, tmp);
 794       movdqu(xmm0, src);
 795       pcmpgtq(xmm0, dst);
 796       blendvpd(dst, src);  // xmm0 as mask
 797     }
 798   }
 799 }
 800 
 801 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 802                                   XMMRegister src1, Address src2, int vlen_enc) {
 803   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 804   if (opcode == Op_UMinV) {
 805     switch(elem_bt) {
 806       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 807       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 808       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 809       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 810       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 811     }
 812   } else {
 813     assert(opcode == Op_UMaxV, "required");
 814     switch(elem_bt) {
 815       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 816       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 817       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 818       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 819       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 820     }
 821   }
 822 }
 823 
 824 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
 825   // On AVX-512 targets without AVX512VL support, emit the unsigned 64-bit min/max
 826   // at the full 512-bit vector width, even for operations over smaller vector sizes.
 827   if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 828     if (opcode == Op_UMaxV) {
 829       evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 830     } else {
 831       assert(opcode == Op_UMinV, "required");
 832       evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 833     }
 834   } else {
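         // Bias both operands by 2^63 (flip the sign bit) so that unsigned ordering
         // can be evaluated with the signed vpcmpgtq comparison below.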
 835     // T1 = -1
 836     vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
 837     // T1 = -1 << 63
 838     vpsllq(xtmp1, xtmp1, 63, vlen_enc);
 839     // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
 840     vpaddq(xtmp2, xtmp1, src2, vlen_enc);
 841     // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
 842     vpaddq(xtmp1, xtmp1, src1, vlen_enc);
 843     // Mask = T2 > T1
 844     vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
 845     if (opcode == Op_UMaxV) {
 846       // Res = Mask ? Src2 : Src1
 847       vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
 848     } else {
 849       // Res = Mask ? Src1 : Src2
 850       vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
 851     }
 852   }
 853 }
 854 
 855 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 856                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
 857   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 858   if (opcode == Op_UMinV) {
 859     switch(elem_bt) {
 860       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 861       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 862       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 863       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 864       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 865     }
 866   } else {
 867     assert(opcode == Op_UMaxV, "required");
 868     switch(elem_bt) {
 869       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 870       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 871       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 872       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 873       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 874     }
 875   }
 876 }
 877 
 878 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 879                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 880                                  int vlen_enc) {
 881   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 882 
 883   if (opcode == Op_MinV) {
 884     if (elem_bt == T_BYTE) {
 885       vpminsb(dst, src1, src2, vlen_enc);
 886     } else if (elem_bt == T_SHORT) {
 887       vpminsw(dst, src1, src2, vlen_enc);
 888     } else if (elem_bt == T_INT) {
 889       vpminsd(dst, src1, src2, vlen_enc);
 890     } else {
 891       assert(elem_bt == T_LONG, "required");
 892       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 893         vpminsq(dst, src1, src2, vlen_enc);
 894       } else {
 895         assert_different_registers(dst, src1, src2);
 896         vpcmpgtq(dst, src1, src2, vlen_enc);
 897         vblendvpd(dst, src1, src2, dst, vlen_enc);
 898       }
 899     }
 900   } else { // opcode == Op_MaxV
 901     if (elem_bt == T_BYTE) {
 902       vpmaxsb(dst, src1, src2, vlen_enc);
 903     } else if (elem_bt == T_SHORT) {
 904       vpmaxsw(dst, src1, src2, vlen_enc);
 905     } else if (elem_bt == T_INT) {
 906       vpmaxsd(dst, src1, src2, vlen_enc);
 907     } else {
 908       assert(elem_bt == T_LONG, "required");
 909       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 910         vpmaxsq(dst, src1, src2, vlen_enc);
 911       } else {
 912         assert_different_registers(dst, src1, src2);
 913         vpcmpgtq(dst, src1, src2, vlen_enc);
 914         vblendvpd(dst, src2, src1, dst, vlen_enc);
 915       }
 916     }
 917   }
 918 }
 919 
 920 // Float/Double min max
 921 
 922 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
 923                                    XMMRegister dst, XMMRegister a, XMMRegister b,
 924                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
 925                                    int vlen_enc) {
 926   assert(UseAVX > 0, "required");
 927   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
 928          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
 929   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
 930   assert_different_registers(a, tmp, atmp, btmp);
 931   assert_different_registers(b, tmp, atmp, btmp);
 932 
 933   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
 934   bool is_double_word = is_double_word_type(elem_bt);
 935 
 936   /* Note on 'non-obvious' assembly sequence:
 937    *
 938    * While there are vminps/vmaxps instructions, there are two important differences between hardware
 939    * and Java on how they handle floats:
 940    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
 941    *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
 942    *
 943    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
 944    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
 945    *                (only useful when signs differ, noop otherwise)
 946    *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
 947    *
 948    *  Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
 949    *   btmp = (b < +0.0) ? a : b
 950    *   atmp = (b < +0.0) ? b : a
 951    *   Tmp  = Max_Float(atmp , btmp)
 952    *   Res  = (atmp == NaN) ? atmp : Tmp
 953    */
 954 
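       // Select the single- or double-precision blend/min-max/compare variants up front
       // so the emission sequence below can be written once for both element types.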
 955   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
 956   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
 957   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
 958   XMMRegister mask;
 959 
 960   if (!is_double_word && is_min) {
 961     mask = a;
 962     vblend = &MacroAssembler::vblendvps;
 963     vmaxmin = &MacroAssembler::vminps;
 964     vcmp = &MacroAssembler::vcmpps;
 965   } else if (!is_double_word && !is_min) {
 966     mask = b;
 967     vblend = &MacroAssembler::vblendvps;
 968     vmaxmin = &MacroAssembler::vmaxps;
 969     vcmp = &MacroAssembler::vcmpps;
 970   } else if (is_double_word && is_min) {
 971     mask = a;
 972     vblend = &MacroAssembler::vblendvpd;
 973     vmaxmin = &MacroAssembler::vminpd;
 974     vcmp = &MacroAssembler::vcmppd;
 975   } else {
 976     assert(is_double_word && !is_min, "sanity");
 977     mask = b;
 978     vblend = &MacroAssembler::vblendvpd;
 979     vmaxmin = &MacroAssembler::vmaxpd;
 980     vcmp = &MacroAssembler::vcmppd;
 981   }
 982 
 983   // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
 984   XMMRegister maxmin, scratch;
 985   if (dst == btmp) {
 986     maxmin = btmp;
 987     scratch = tmp;
 988   } else {
 989     maxmin = tmp;
 990     scratch = btmp;
 991   }
 992 
 993   bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
 994   if (precompute_mask && !is_double_word) {
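         // Arithmetic right shift by 32 replicates each lane's sign bit, yielding an
         // all-ones mask for negative lanes and zero otherwise.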
 995     vpsrad(tmp, mask, 32, vlen_enc);
 996     mask = tmp;
 997   } else if (precompute_mask && is_double_word) {
 998     vpxor(tmp, tmp, tmp, vlen_enc);
 999     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1000     mask = tmp;
1001   }
1002 
1003   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1004   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1005   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1006   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1007   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1008 }
1009 
1010 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1011                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1012                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1013                                     int vlen_enc) {
1014   assert(UseAVX > 2, "required");
1015   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1016          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1017   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1018   assert_different_registers(dst, a, atmp, btmp);
1019   assert_different_registers(dst, b, atmp, btmp);
1020 
1021   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1022   bool is_double_word = is_double_word_type(elem_bt);
1023   bool merge = true;
1024 
1025   if (!is_double_word && is_min) {
1026     evpmovd2m(ktmp, a, vlen_enc);
1027     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1028     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1029     vminps(dst, atmp, btmp, vlen_enc);
1030     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1031     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1032   } else if (!is_double_word && !is_min) {
1033     evpmovd2m(ktmp, b, vlen_enc);
1034     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1035     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1036     vmaxps(dst, atmp, btmp, vlen_enc);
1037     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1038     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1039   } else if (is_double_word && is_min) {
1040     evpmovq2m(ktmp, a, vlen_enc);
1041     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1042     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1043     vminpd(dst, atmp, btmp, vlen_enc);
1044     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1045     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1046   } else {
1047     assert(is_double_word && !is_min, "sanity");
1048     evpmovq2m(ktmp, b, vlen_enc);
1049     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1050     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1051     vmaxpd(dst, atmp, btmp, vlen_enc);
1052     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1053     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1054   }
1055 }
1056 
1057 void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1058                                    XMMRegister src1, XMMRegister src2, int vlen_enc) {
1059   assert(opc == Op_MinV || opc == Op_MinReductionV ||
1060          opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");
1061 
1062   int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_MINMAX_MIN_COMPARE_SIGN
1063                                                          : AVX10_MINMAX_MAX_COMPARE_SIGN;
1064   if (elem_bt == T_FLOAT) {
1065     evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
1066   } else {
1067     assert(elem_bt == T_DOUBLE, "");
1068     evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
1069   }
1070 }
1071 
1072 // Float/Double signum
1073 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1074   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1075 
1076   Label DONE_LABEL;
1077 
1078   if (opcode == Op_SignumF) {
1079     ucomiss(dst, zero);
1080     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1081     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1082     movflt(dst, one);
1083     jcc(Assembler::above, DONE_LABEL);
1084     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1085   } else if (opcode == Op_SignumD) {
1086     ucomisd(dst, zero);
1087     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1088     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1089     movdbl(dst, one);
1090     jcc(Assembler::above, DONE_LABEL);
1091     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1092   }
1093 
1094   bind(DONE_LABEL);
1095 }
1096 
1097 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1098   if (sign) {
1099     pmovsxbw(dst, src);
1100   } else {
1101     pmovzxbw(dst, src);
1102   }
1103 }
1104 
1105 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1106   if (sign) {
1107     vpmovsxbw(dst, src, vector_len);
1108   } else {
1109     vpmovzxbw(dst, src, vector_len);
1110   }
1111 }
1112 
1113 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1114   if (sign) {
1115     vpmovsxbd(dst, src, vector_len);
1116   } else {
1117     vpmovzxbd(dst, src, vector_len);
1118   }
1119 }
1120 
1121 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1122   if (sign) {
1123     vpmovsxwd(dst, src, vector_len);
1124   } else {
1125     vpmovzxwd(dst, src, vector_len);
1126   }
1127 }
1128 
1129 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1130                                      int shift, int vector_len) {
1131   if (opcode == Op_RotateLeftV) {
1132     if (etype == T_INT) {
1133       evprold(dst, src, shift, vector_len);
1134     } else {
1135       assert(etype == T_LONG, "expected type T_LONG");
1136       evprolq(dst, src, shift, vector_len);
1137     }
1138   } else {
1139     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1140     if (etype == T_INT) {
1141       evprord(dst, src, shift, vector_len);
1142     } else {
1143       assert(etype == T_LONG, "expected type T_LONG");
1144       evprorq(dst, src, shift, vector_len);
1145     }
1146   }
1147 }
1148 
1149 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1150                                      XMMRegister shift, int vector_len) {
1151   if (opcode == Op_RotateLeftV) {
1152     if (etype == T_INT) {
1153       evprolvd(dst, src, shift, vector_len);
1154     } else {
1155       assert(etype == T_LONG, "expected type T_LONG");
1156       evprolvq(dst, src, shift, vector_len);
1157     }
1158   } else {
1159     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1160     if (etype == T_INT) {
1161       evprorvd(dst, src, shift, vector_len);
1162     } else {
1163       assert(etype == T_LONG, "expected type T_LONG");
1164       evprorvq(dst, src, shift, vector_len);
1165     }
1166   }
1167 }
1168 
1169 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1170   if (opcode == Op_RShiftVI) {
1171     psrad(dst, shift);
1172   } else if (opcode == Op_LShiftVI) {
1173     pslld(dst, shift);
1174   } else {
1175     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1176     psrld(dst, shift);
1177   }
1178 }
1179 
1180 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1181   switch (opcode) {
1182     case Op_RShiftVI:  psrad(dst, shift); break;
1183     case Op_LShiftVI:  pslld(dst, shift); break;
1184     case Op_URShiftVI: psrld(dst, shift); break;
1185 
1186     default: assert(false, "%s", NodeClassNames[opcode]);
1187   }
1188 }
1189 
1190 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1191   if (opcode == Op_RShiftVI) {
1192     vpsrad(dst, nds, shift, vector_len);
1193   } else if (opcode == Op_LShiftVI) {
1194     vpslld(dst, nds, shift, vector_len);
1195   } else {
1196     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1197     vpsrld(dst, nds, shift, vector_len);
1198   }
1199 }
1200 
1201 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1202   switch (opcode) {
1203     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1204     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1205     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1206 
1207     default: assert(false, "%s", NodeClassNames[opcode]);
1208   }
1209 }
1210 
1211 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1212   switch (opcode) {
1213     case Op_RShiftVB:  // fall-through
1214     case Op_RShiftVS:  psraw(dst, shift); break;
1215 
1216     case Op_LShiftVB:  // fall-through
1217     case Op_LShiftVS:  psllw(dst, shift);   break;
1218 
1219     case Op_URShiftVS: // fall-through
1220     case Op_URShiftVB: psrlw(dst, shift);  break;
1221 
1222     default: assert(false, "%s", NodeClassNames[opcode]);
1223   }
1224 }
1225 
1226 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1227   switch (opcode) {
1228     case Op_RShiftVB:  // fall-through
1229     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1230 
1231     case Op_LShiftVB:  // fall-through
1232     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1233 
1234     case Op_URShiftVS: // fall-through
1235     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1236 
1237     default: assert(false, "%s", NodeClassNames[opcode]);
1238   }
1239 }
1240 
1241 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1242   switch (opcode) {
1243     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1244     case Op_LShiftVL:  psllq(dst, shift); break;
1245     case Op_URShiftVL: psrlq(dst, shift); break;
1246 
1247     default: assert(false, "%s", NodeClassNames[opcode]);
1248   }
1249 }
1250 
1251 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1252   if (opcode == Op_RShiftVL) {
1253     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1254   } else if (opcode == Op_LShiftVL) {
1255     psllq(dst, shift);
1256   } else {
1257     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1258     psrlq(dst, shift);
1259   }
1260 }
1261 
1262 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1263   switch (opcode) {
1264     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1265     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1266     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1267 
1268     default: assert(false, "%s", NodeClassNames[opcode]);
1269   }
1270 }
1271 
1272 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1273   if (opcode == Op_RShiftVL) {
1274     evpsraq(dst, nds, shift, vector_len);
1275   } else if (opcode == Op_LShiftVL) {
1276     vpsllq(dst, nds, shift, vector_len);
1277   } else {
1278     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1279     vpsrlq(dst, nds, shift, vector_len);
1280   }
1281 }
1282 
1283 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1284   switch (opcode) {
1285     case Op_RShiftVB:  // fall-through
1286     case Op_RShiftVS:  // fall-through
1287     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1288 
1289     case Op_LShiftVB:  // fall-through
1290     case Op_LShiftVS:  // fall-through
1291     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1292 
1293     case Op_URShiftVB: // fall-through
1294     case Op_URShiftVS: // fall-through
1295     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1296 
1297     default: assert(false, "%s", NodeClassNames[opcode]);
1298   }
1299 }
1300 
1301 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1302   switch (opcode) {
1303     case Op_RShiftVB:  // fall-through
1304     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1305 
1306     case Op_LShiftVB:  // fall-through
1307     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1308 
1309     case Op_URShiftVB: // fall-through
1310     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1311 
1312     default: assert(false, "%s", NodeClassNames[opcode]);
1313   }
1314 }
1315 
1316 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1317   assert(UseAVX >= 2, "required");
1318   switch (opcode) {
1319     case Op_RShiftVL: {
1320       if (UseAVX > 2) {
1321         assert(tmp == xnoreg, "not used");
1322         if (!VM_Version::supports_avx512vl()) {
1323           vlen_enc = Assembler::AVX_512bit;
1324         }
1325         evpsravq(dst, src, shift, vlen_enc);
1326       } else {
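             // AVX2 has no 64-bit arithmetic variable shift; emulate sra(x, s) as
             // (srl(x, s) ^ m) - m, where m = srl(sign_mask, s) isolates the shifted sign bit.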
1327         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1328         vpsrlvq(dst, src, shift, vlen_enc);
1329         vpsrlvq(tmp, tmp, shift, vlen_enc);
1330         vpxor(dst, dst, tmp, vlen_enc);
1331         vpsubq(dst, dst, tmp, vlen_enc);
1332       }
1333       break;
1334     }
1335     case Op_LShiftVL: {
1336       assert(tmp == xnoreg, "not used");
1337       vpsllvq(dst, src, shift, vlen_enc);
1338       break;
1339     }
1340     case Op_URShiftVL: {
1341       assert(tmp == xnoreg, "not used");
1342       vpsrlvq(dst, src, shift, vlen_enc);
1343       break;
1344     }
1345     default: assert(false, "%s", NodeClassNames[opcode]);
1346   }
1347 }
1348 
1349 // Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1350 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1351   assert(opcode == Op_LShiftVB ||
1352          opcode == Op_RShiftVB ||
1353          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1354   bool sign = (opcode != Op_URShiftVB);
1355   assert(vector_len == 0, "required");
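       // Widen the bytes to dwords, apply the 32-bit variable shift, mask each lane
       // back to byte range, then pack the dwords down to words.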
1356   vextendbd(sign, dst, src, 1);
1357   vpmovzxbd(vtmp, shift, 1);
1358   varshiftd(opcode, dst, dst, vtmp, 1);
1359   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1360   vextracti128_high(vtmp, dst);
1361   vpackusdw(dst, dst, vtmp, 0);
1362 }
1363 
1364 // Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1365 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1366   assert(opcode == Op_LShiftVB ||
1367          opcode == Op_RShiftVB ||
1368          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1369   bool sign = (opcode != Op_URShiftVB);
1370   int ext_vector_len = vector_len + 1;
1371   vextendbw(sign, dst, src, ext_vector_len);
1372   vpmovzxbw(vtmp, shift, ext_vector_len);
1373   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1374   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1375   if (vector_len == 0) {
1376     vextracti128_high(vtmp, dst);
1377     vpackuswb(dst, dst, vtmp, vector_len);
1378   } else {
1379     vextracti64x4_high(vtmp, dst);
1380     vpackuswb(dst, dst, vtmp, vector_len);
1381     vpermq(dst, dst, 0xD8, vector_len);
1382   }
1383 }
1384 
1385 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1386   switch(typ) {
1387     case T_BYTE:
1388       pinsrb(dst, val, idx);
1389       break;
1390     case T_SHORT:
1391       pinsrw(dst, val, idx);
1392       break;
1393     case T_INT:
1394       pinsrd(dst, val, idx);
1395       break;
1396     case T_LONG:
1397       pinsrq(dst, val, idx);
1398       break;
1399     default:
1400       assert(false,"Should not reach here.");
1401       break;
1402   }
1403 }
1404 
1405 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1406   switch(typ) {
1407     case T_BYTE:
1408       vpinsrb(dst, src, val, idx);
1409       break;
1410     case T_SHORT:
1411       vpinsrw(dst, src, val, idx);
1412       break;
1413     case T_INT:
1414       vpinsrd(dst, src, val, idx);
1415       break;
1416     case T_LONG:
1417       vpinsrq(dst, src, val, idx);
1418       break;
1419     default:
1420       assert(false,"Should not reach here.");
1421       break;
1422   }
1423 }
1424 
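     // Gather a 64-bit slice (8 bytes or 4 shorts) from memory at (base, idx_base)
     // under the scalar mask bits starting at bit mask_idx: lanes whose mask bit is
     // clear are left zero. mask_idx is advanced by the number of lanes processed.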
1425 void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
1426                                          Register base, Register idx_base,
1427                                          Register mask, Register mask_idx,
1428                                          Register rtmp, int vlen_enc) {
1429   vpxor(dst, dst, dst, vlen_enc);
1430   if (elem_bt == T_SHORT) {
1431     for (int i = 0; i < 4; i++) {
1432       // dst[i] = mask[i] ? src[idx_base[i]] : 0
1433       Label skip_load;
1434       btq(mask, mask_idx);
1435       jccb(Assembler::carryClear, skip_load);
1436       movl(rtmp, Address(idx_base, i * 4));
1437       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1438       bind(skip_load);
1439       incq(mask_idx);
1440     }
1441   } else {
1442     assert(elem_bt == T_BYTE, "");
1443     for (int i = 0; i < 8; i++) {
1444       // dst[i] = mask[i] ? src[idx_base[i]] : 0
1445       Label skip_load;
1446       btq(mask, mask_idx);
1447       jccb(Assembler::carryClear, skip_load);
1448       movl(rtmp, Address(idx_base, i * 4));
1449       pinsrb(dst, Address(base, rtmp), i);
1450       bind(skip_load);
1451       incq(mask_idx);
1452     }
1453   }
1454 }
1455 
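     // Unmasked variant: gather a full 64-bit slice (8 bytes or 4 shorts) using the
     // int indices at idx_base, leaving the upper lanes of dst zeroed.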
1456 void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
1457                                   Register base, Register idx_base,
1458                                   Register rtmp, int vlen_enc) {
1459   vpxor(dst, dst, dst, vlen_enc);
1460   if (elem_bt == T_SHORT) {
1461     for (int i = 0; i < 4; i++) {
1462       // dst[i] = src[idx_base[i]]
1463       movl(rtmp, Address(idx_base, i * 4));
1464       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1465     }
1466   } else {
1467     assert(elem_bt == T_BYTE, "");
1468     for (int i = 0; i < 8; i++) {
1469       // dst[i] = src[idx_base[i]]
1470       movl(rtmp, Address(idx_base, i * 4));
1471       pinsrb(dst, Address(base, rtmp), i);
1472     }
1473   }
1474 }
1475 
1476 /*
1477  * Gather using a hybrid algorithm: first partially unroll a scalar loop
1478  * to accumulate values from the gather indices into a quad-word (64-bit) slice.
1479  * A slice may hold 8 bytes or 4 short values. This is followed by a vector
1480  * permutation that places the slice into the appropriate vector lane
1481  * locations in the destination vector. The following pseudo code describes
1482  * the algorithm in detail:
1483  *
1484  * DST_VEC = ZERO_VEC
1485  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1486  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1487  * FOREACH_ITER:
1488  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1489  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1490  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1491  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1492  *
1493  * With each iteration, the doubleword permute indices (0,1) corresponding
1494  * to the gathered quadword get shifted right by two lane positions.
1495  *
1496  */
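     // For example, with T_SHORT and a 256-bit destination: iteration 0 leaves its
     // gathered quadword in dwords {0,1} (the remaining permute lanes read zeroed
     // dwords of TMP_VEC_64); after PERM_INDEX is decremented to {-2,-1,0,1,...},
     // iteration 1 routes its quadword to dwords {2,3} because the negative indices
     // wrap around to high, zeroed lanes; and so on until all vector_len elements
     // have been gathered.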
1497 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1498                                         Register base, Register idx_base,
1499                                         Register mask, XMMRegister xtmp1,
1500                                         XMMRegister xtmp2, XMMRegister temp_dst,
1501                                         Register rtmp, Register mask_idx,
1502                                         Register length, int vector_len, int vlen_enc) {
1503   Label GATHER8_LOOP;
1504   assert(is_subword_type(elem_ty), "");
1505   movl(length, vector_len);
1506   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1507   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1508   vallones(xtmp2, vlen_enc);
1509   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1510   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1511   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1512 
1513   bind(GATHER8_LOOP);
1514     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1515     if (mask == noreg) {
1516       vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
1517     } else {
1518       vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
1519     }
1520     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1521     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1522     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1523     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1524     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1525     vpor(dst, dst, temp_dst, vlen_enc);
1526     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
1527     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1528     jcc(Assembler::notEqual, GATHER8_LOOP);
1529 }
1530 
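     // AVX2 gathers for int/long/float/double elements using an XMM register as the
     // per-lane mask; evgather/evscatter below are the AVX-512 forms driven by an
     // opmask register.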
1531 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1532   switch(typ) {
1533     case T_INT:
1534       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1535       break;
1536     case T_FLOAT:
1537       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1538       break;
1539     case T_LONG:
1540       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1541       break;
1542     case T_DOUBLE:
1543       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1544       break;
1545     default:
1546       assert(false,"Should not reach here.");
1547       break;
1548   }
1549 }
1550 
1551 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1552   switch(typ) {
1553     case T_INT:
1554       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1555       break;
1556     case T_FLOAT:
1557       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1558       break;
1559     case T_LONG:
1560       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1561       break;
1562     case T_DOUBLE:
1563       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1564       break;
1565     default:
1566       assert(false,"Should not reach here.");
1567       break;
1568   }
1569 }
1570 
1571 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1572   switch(typ) {
1573     case T_INT:
1574       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1575       break;
1576     case T_FLOAT:
1577       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1578       break;
1579     case T_LONG:
1580       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1581       break;
1582     case T_DOUBLE:
1583       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1584       break;
1585     default:
1586       assert(false,"Should not reach here.");
1587       break;
1588   }
1589 }
1590 
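     // Turn a vector of boolean bytes (0 or 1 per element) into a vector mask with
     // all bits set in active lanes: negate the bytes (0 -> 0, 1 -> -1) and then
     // sign-extend to the element width.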
1591 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1592   if (vlen_in_bytes <= 16) {
1593     pxor (dst, dst);
1594     psubb(dst, src);
1595     switch (elem_bt) {
1596       case T_BYTE:   /* nothing to do */ break;
1597       case T_SHORT:  pmovsxbw(dst, dst); break;
1598       case T_INT:    pmovsxbd(dst, dst); break;
1599       case T_FLOAT:  pmovsxbd(dst, dst); break;
1600       case T_LONG:   pmovsxbq(dst, dst); break;
1601       case T_DOUBLE: pmovsxbq(dst, dst); break;
1602 
1603       default: assert(false, "%s", type2name(elem_bt));
1604     }
1605   } else {
1606     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1607     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1608 
1609     vpxor (dst, dst, dst, vlen_enc);
1610     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1611 
1612     switch (elem_bt) {
1613       case T_BYTE:   /* nothing to do */            break;
1614       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1615       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1616       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1617       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1618       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1619 
1620       default: assert(false, "%s", type2name(elem_bt));
1621     }
1622   }
1623 }
1624 
1625 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1626   if (novlbwdq) {
1627     vpmovsxbd(xtmp, src, vlen_enc);
1628     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1629             Assembler::eq, true, vlen_enc, noreg);
1630   } else {
1631     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1632     vpsubb(xtmp, xtmp, src, vlen_enc);
1633     evpmovb2m(dst, xtmp, vlen_enc);
1634   }
1635 }
1636 
1637 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1638   if (is_integral_type(bt)) {
1639     switch (vlen_in_bytes) {
1640       case 4:  movdl(dst, src);   break;
1641       case 8:  movq(dst, src);    break;
1642       case 16: movdqu(dst, src);  break;
1643       case 32: vmovdqu(dst, src); break;
1644       case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1645       default: ShouldNotReachHere();
1646     }
1647   } else {
1648     switch (vlen_in_bytes) {
1649       case 4:  movflt(dst, src); break;
1650       case 8:  movdbl(dst, src); break;
1651       case 16: movups(dst, src); break;
1652       case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1653       case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1654       default: ShouldNotReachHere();
1655     }
1656   }
1657 }
1658 
1659 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1660   assert(rscratch != noreg || always_reachable(src), "missing");
1661 
1662   if (reachable(src)) {
1663     load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1664   } else {
1665     lea(rscratch, src);
1666     load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1667   }
1668 }
1669 
1670 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1671   int vlen_enc = vector_length_encoding(vlen);
1672   if (VM_Version::supports_avx()) {
1673     if (bt == T_LONG) {
1674       if (VM_Version::supports_avx2()) {
1675         vpbroadcastq(dst, src, vlen_enc);
1676       } else {
1677         vmovddup(dst, src, vlen_enc);
1678       }
1679     } else if (bt == T_DOUBLE) {
1680       if (vlen_enc != Assembler::AVX_128bit) {
1681         vbroadcastsd(dst, src, vlen_enc, noreg);
1682       } else {
1683         vmovddup(dst, src, vlen_enc);
1684       }
1685     } else {
1686       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1687         vpbroadcastd(dst, src, vlen_enc);
1688       } else {
1689         vbroadcastss(dst, src, vlen_enc);
1690       }
1691     }
1692   } else if (VM_Version::supports_sse3()) {
1693     movddup(dst, src);
1694   } else {
1695     load_vector(bt, dst, src, vlen);
1696   }
1697 }
1698 
1699 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1700   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between consecutive types is 64 bytes.
1701   int offset = exact_log2(type2aelembytes(bt)) << 6;
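       // e.g. T_INT -> 2 << 6 = 128, T_DOUBLE -> (3 << 6) + 128 = 320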
1702   if (is_floating_point_type(bt)) {
1703     offset += 128;
1704   }
1705   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1706   load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1707 }
1708 
1709 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1710 
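     // reduce_operation_128/256 emit one elementwise step of a reduction; the
     // reduce<N><T> helpers below narrow the vector with these steps and finally
     // fold in the incoming scalar value (src1 for the integral variants, dst for
     // the float/double variants).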
1711 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1712   int vector_len = Assembler::AVX_128bit;
1713 
1714   switch (opcode) {
1715     case Op_AndReductionV:  pand(dst, src); break;
1716     case Op_OrReductionV:   por (dst, src); break;
1717     case Op_XorReductionV:  pxor(dst, src); break;
1718     case Op_MinReductionV:
1719       switch (typ) {
1720         case T_BYTE:        pminsb(dst, src); break;
1721         case T_SHORT:       pminsw(dst, src); break;
1722         case T_INT:         pminsd(dst, src); break;
1723         case T_LONG:        assert(UseAVX > 2, "required");
1724                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1725         default:            assert(false, "wrong type");
1726       }
1727       break;
1728     case Op_MaxReductionV:
1729       switch (typ) {
1730         case T_BYTE:        pmaxsb(dst, src); break;
1731         case T_SHORT:       pmaxsw(dst, src); break;
1732         case T_INT:         pmaxsd(dst, src); break;
1733         case T_LONG:        assert(UseAVX > 2, "required");
1734                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1735         default:            assert(false, "wrong type");
1736       }
1737       break;
1738     case Op_AddReductionVF: addss(dst, src); break;
1739     case Op_AddReductionVD: addsd(dst, src); break;
1740     case Op_AddReductionVI:
1741       switch (typ) {
1742         case T_BYTE:        paddb(dst, src); break;
1743         case T_SHORT:       paddw(dst, src); break;
1744         case T_INT:         paddd(dst, src); break;
1745         default:            assert(false, "wrong type");
1746       }
1747       break;
1748     case Op_AddReductionVL: paddq(dst, src); break;
1749     case Op_MulReductionVF: mulss(dst, src); break;
1750     case Op_MulReductionVD: mulsd(dst, src); break;
1751     case Op_MulReductionVI:
1752       switch (typ) {
1753         case T_SHORT:       pmullw(dst, src); break;
1754         case T_INT:         pmulld(dst, src); break;
1755         default:            assert(false, "wrong type");
1756       }
1757       break;
1758     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1759                             evpmullq(dst, dst, src, vector_len); break;
1760     default:                assert(false, "wrong opcode");
1761   }
1762 }
1763 
1764 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1765   switch (opcode) {
1766     case Op_AddReductionVF: addps(dst, src); break;
1767     case Op_AddReductionVD: addpd(dst, src); break;
1768     case Op_MulReductionVF: mulps(dst, src); break;
1769     case Op_MulReductionVD: mulpd(dst, src); break;
1770     default:                assert(false, "%s", NodeClassNames[opcode]);
1771   }
1772 }
1773 
1774 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1775   int vector_len = Assembler::AVX_256bit;
1776 
1777   switch (opcode) {
1778     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1779     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1780     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1781     case Op_MinReductionV:
1782       switch (typ) {
1783         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1784         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1785         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1786         case T_LONG:        assert(UseAVX > 2, "required");
1787                             vpminsq(dst, src1, src2, vector_len); break;
1788         default:            assert(false, "wrong type");
1789       }
1790       break;
1791     case Op_MaxReductionV:
1792       switch (typ) {
1793         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1794         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1795         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1796         case T_LONG:        assert(UseAVX > 2, "required");
1797                             vpmaxsq(dst, src1, src2, vector_len); break;
1798         default:            assert(false, "wrong type");
1799       }
1800       break;
1801     case Op_AddReductionVI:
1802       switch (typ) {
1803         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1804         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1805         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1806         default:            assert(false, "wrong type");
1807       }
1808       break;
1809     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1810     case Op_MulReductionVI:
1811       switch (typ) {
1812         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1813         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1814         default:            assert(false, "wrong type");
1815       }
1816       break;
1817     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1818     default:                assert(false, "wrong opcode");
1819   }
1820 }
1821 
1822 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1823   int vector_len = Assembler::AVX_256bit;
1824 
1825   switch (opcode) {
1826     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1827     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1828     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1829     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1830     default:                assert(false, "%s", NodeClassNames[opcode]);
1831   }
1832 }
1833 
1834 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1835                                   XMMRegister dst, XMMRegister src,
1836                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1837   switch (opcode) {
1838     case Op_AddReductionVF:
1839     case Op_MulReductionVF:
1840       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1841       break;
1842 
1843     case Op_AddReductionVD:
1844     case Op_MulReductionVD:
1845       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1846       break;
1847 
1848     default: assert(false, "wrong opcode");
1849   }
1850 }
1851 
1852 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1853                                             XMMRegister dst, XMMRegister src,
1854                                             XMMRegister vtmp1, XMMRegister vtmp2) {
1855   switch (opcode) {
1856     case Op_AddReductionVF:
1857     case Op_MulReductionVF:
1858       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1859       break;
1860 
1861     case Op_AddReductionVD:
1862     case Op_MulReductionVD:
1863       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1864       break;
1865 
1866     default: assert(false, "%s", NodeClassNames[opcode]);
1867   }
1868 }
1869 
1870 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1871                              Register dst, Register src1, XMMRegister src2,
1872                              XMMRegister vtmp1, XMMRegister vtmp2) {
1873   switch (vlen) {
1874     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1875     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1876     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1877     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1878 
1879     default: assert(false, "wrong vector length");
1880   }
1881 }
1882 
1883 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1884                              Register dst, Register src1, XMMRegister src2,
1885                              XMMRegister vtmp1, XMMRegister vtmp2) {
1886   switch (vlen) {
1887     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1888     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1889     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1890     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1891 
1892     default: assert(false, "wrong vector length");
1893   }
1894 }
1895 
1896 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1897                              Register dst, Register src1, XMMRegister src2,
1898                              XMMRegister vtmp1, XMMRegister vtmp2) {
1899   switch (vlen) {
1900     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1901     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1902     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1903     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1904 
1905     default: assert(false, "wrong vector length");
1906   }
1907 }
1908 
1909 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1910                              Register dst, Register src1, XMMRegister src2,
1911                              XMMRegister vtmp1, XMMRegister vtmp2) {
1912   switch (vlen) {
1913     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1914     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1915     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1916     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1917 
1918     default: assert(false, "wrong vector length");
1919   }
1920 }
1921 
1922 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1923                              Register dst, Register src1, XMMRegister src2,
1924                              XMMRegister vtmp1, XMMRegister vtmp2) {
1925   switch (vlen) {
1926     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1927     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1928     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1929 
1930     default: assert(false, "wrong vector length");
1931   }
1932 }
1933 
1934 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1935   switch (vlen) {
1936     case 2:
1937       assert(vtmp2 == xnoreg, "");
1938       reduce2F(opcode, dst, src, vtmp1);
1939       break;
1940     case 4:
1941       assert(vtmp2 == xnoreg, "");
1942       reduce4F(opcode, dst, src, vtmp1);
1943       break;
1944     case 8:
1945       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1946       break;
1947     case 16:
1948       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1949       break;
1950     default: assert(false, "wrong vector length");
1951   }
1952 }
1953 
1954 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1955   switch (vlen) {
1956     case 2:
1957       assert(vtmp2 == xnoreg, "");
1958       reduce2D(opcode, dst, src, vtmp1);
1959       break;
1960     case 4:
1961       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1962       break;
1963     case 8:
1964       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1965       break;
1966     default: assert(false, "wrong vector length");
1967   }
1968 }
1969 
1970 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1971   switch (vlen) {
1972     case 2:
1973       assert(vtmp1 == xnoreg, "");
1974       assert(vtmp2 == xnoreg, "");
1975       unorderedReduce2F(opcode, dst, src);
1976       break;
1977     case 4:
1978       assert(vtmp2 == xnoreg, "");
1979       unorderedReduce4F(opcode, dst, src, vtmp1);
1980       break;
1981     case 8:
1982       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
1983       break;
1984     case 16:
1985       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
1986       break;
1987     default: assert(false, "wrong vector length");
1988   }
1989 }
1990 
1991 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1992   switch (vlen) {
1993     case 2:
1994       assert(vtmp1 == xnoreg, "");
1995       assert(vtmp2 == xnoreg, "");
1996       unorderedReduce2D(opcode, dst, src);
1997       break;
1998     case 4:
1999       assert(vtmp2 == xnoreg, "");
2000       unorderedReduce4D(opcode, dst, src, vtmp1);
2001       break;
2002     case 8:
2003       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2004       break;
2005     default: assert(false, "wrong vector length");
2006   }
2007 }
2008 
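     // For int/short additions the horizontal-add forms (phaddd/phaddw/vphaddd/
     // vphaddw) are used; otherwise the upper elements are shuffled down and
     // combined via reduce_operation_128/256.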
2009 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2010   if (opcode == Op_AddReductionVI) {
2011     if (vtmp1 != src2) {
2012       movdqu(vtmp1, src2);
2013     }
2014     phaddd(vtmp1, vtmp1);
2015   } else {
2016     pshufd(vtmp1, src2, 0x1);
2017     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2018   }
2019   movdl(vtmp2, src1);
2020   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2021   movdl(dst, vtmp1);
2022 }
2023 
2024 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2025   if (opcode == Op_AddReductionVI) {
2026     if (vtmp1 != src2) {
2027       movdqu(vtmp1, src2);
2028     }
2029     phaddd(vtmp1, src2);
2030     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2031   } else {
2032     pshufd(vtmp2, src2, 0xE);
2033     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2034     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2035   }
2036 }
2037 
2038 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2039   if (opcode == Op_AddReductionVI) {
2040     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2041     vextracti128_high(vtmp2, vtmp1);
2042     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2043     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2044   } else {
2045     vextracti128_high(vtmp1, src2);
2046     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2047     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2048   }
2049 }
2050 
2051 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2052   vextracti64x4_high(vtmp2, src2);
2053   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2054   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2055 }
2056 
2057 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2058   pshufd(vtmp2, src2, 0x1);
2059   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2060   movdqu(vtmp1, vtmp2);
2061   psrldq(vtmp1, 2);
2062   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2063   movdqu(vtmp2, vtmp1);
2064   psrldq(vtmp2, 1);
2065   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2066   movdl(vtmp2, src1);
2067   pmovsxbd(vtmp1, vtmp1);
2068   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2069   pextrb(dst, vtmp1, 0x0);
2070   movsbl(dst, dst);
2071 }
2072 
2073 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2074   pshufd(vtmp1, src2, 0xE);
2075   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2076   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2077 }
2078 
2079 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2080   vextracti128_high(vtmp2, src2);
2081   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2082   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2083 }
2084 
2085 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2086   vextracti64x4_high(vtmp1, src2);
2087   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2088   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2089 }
2090 
2091 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2092   pmovsxbw(vtmp2, src2);
2093   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2094 }
2095 
2096 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2097   if (UseAVX > 1) {
2098     int vector_len = Assembler::AVX_256bit;
2099     vpmovsxbw(vtmp1, src2, vector_len);
2100     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2101   } else {
2102     pmovsxbw(vtmp2, src2);
2103     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2104     pshufd(vtmp2, src2, 0xE);
2105     pmovsxbw(vtmp2, vtmp2);
2106     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2107   }
2108 }
2109 
2110 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2111   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2112     int vector_len = Assembler::AVX_512bit;
2113     vpmovsxbw(vtmp1, src2, vector_len);
2114     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2115   } else {
2116     assert(UseAVX >= 2,"Should not reach here.");
2117     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2118     vextracti128_high(vtmp2, src2);
2119     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2120   }
2121 }
2122 
2123 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2124   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2125   vextracti64x4_high(vtmp2, src2);
2126   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2127 }
2128 
2129 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2130   if (opcode == Op_AddReductionVI) {
2131     if (vtmp1 != src2) {
2132       movdqu(vtmp1, src2);
2133     }
2134     phaddw(vtmp1, vtmp1);
2135     phaddw(vtmp1, vtmp1);
2136   } else {
2137     pshufd(vtmp2, src2, 0x1);
2138     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2139     movdqu(vtmp1, vtmp2);
2140     psrldq(vtmp1, 2);
2141     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2142   }
2143   movdl(vtmp2, src1);
2144   pmovsxwd(vtmp1, vtmp1);
2145   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2146   pextrw(dst, vtmp1, 0x0);
2147   movswl(dst, dst);
2148 }
2149 
2150 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2151   if (opcode == Op_AddReductionVI) {
2152     if (vtmp1 != src2) {
2153       movdqu(vtmp1, src2);
2154     }
2155     phaddw(vtmp1, src2);
2156   } else {
2157     pshufd(vtmp1, src2, 0xE);
2158     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2159   }
2160   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2161 }
2162 
2163 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2164   if (opcode == Op_AddReductionVI) {
2165     int vector_len = Assembler::AVX_256bit;
2166     vphaddw(vtmp2, src2, src2, vector_len);
2167     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2168   } else {
2169     vextracti128_high(vtmp2, src2);
2170     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2171   }
2172   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2173 }
2174 
2175 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2176   int vector_len = Assembler::AVX_256bit;
2177   vextracti64x4_high(vtmp1, src2);
2178   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2179   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2180 }
2181 
2182 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2183   pshufd(vtmp2, src2, 0xE);
2184   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2185   movdq(vtmp1, src1);
2186   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2187   movdq(dst, vtmp1);
2188 }
2189 
2190 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2191   vextracti128_high(vtmp1, src2);
2192   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2193   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2194 }
2195 
2196 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2197   vextracti64x4_high(vtmp2, src2);
2198   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2199   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2200 }
2201 
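     // Materialize an opmask register whose lowest len bits are set.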
2202 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2203   mov64(temp, -1L);
2204   bzhiq(temp, temp, len);
2205   kmovql(dst, temp);
2206 }
2207 
2208 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2209   reduce_operation_128(T_FLOAT, opcode, dst, src);
2210   pshufd(vtmp, src, 0x1);
2211   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2212 }
2213 
2214 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2215   reduce2F(opcode, dst, src, vtmp);
2216   pshufd(vtmp, src, 0x2);
2217   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2218   pshufd(vtmp, src, 0x3);
2219   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2220 }
2221 
2222 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2223   reduce4F(opcode, dst, src, vtmp2);
2224   vextractf128_high(vtmp2, src);
2225   reduce4F(opcode, dst, vtmp2, vtmp1);
2226 }
2227 
2228 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2229   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2230   vextracti64x4_high(vtmp1, src);
2231   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2232 }
2233 
2234 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2235   pshufd(dst, src, 0x1);
2236   reduce_operation_128(T_FLOAT, opcode, dst, src);
2237 }
2238 
2239 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2240   pshufd(vtmp, src, 0xE);
2241   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2242   unorderedReduce2F(opcode, dst, vtmp);
2243 }
2244 
2245 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2246   vextractf128_high(vtmp1, src);
2247   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2248   unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2249 }
2250 
2251 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2252   vextractf64x4_high(vtmp2, src);
2253   unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2254   unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2255 }
2256 
2257 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2258   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2259   pshufd(vtmp, src, 0xE);
2260   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2261 }
2262 
2263 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2264   reduce2D(opcode, dst, src, vtmp2);
2265   vextractf128_high(vtmp2, src);
2266   reduce2D(opcode, dst, vtmp2, vtmp1);
2267 }
2268 
2269 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2270   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2271   vextracti64x4_high(vtmp1, src);
2272   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2273 }
2274 
2275 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2276   pshufd(dst, src, 0xE);
2277   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2278 }
2279 
2280 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2281   vextractf128_high(vtmp, src);
2282   unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2283   unorderedReduce2D(opcode, dst, vtmp);
2284 }
2285 
2286 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2287   vextractf64x4_high(vtmp2, src);
2288   unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2289   unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2290 }
2291 
2292 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2293   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2294 }
2295 
2296 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2297   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2298 }
2299 
2300 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
2301   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2302 }
2303 
2304 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2305                                  int vec_enc) {
2306   switch(elem_bt) {
2307     case T_INT:
2308     case T_FLOAT:
2309       vmaskmovps(dst, src, mask, vec_enc);
2310       break;
2311     case T_LONG:
2312     case T_DOUBLE:
2313       vmaskmovpd(dst, src, mask, vec_enc);
2314       break;
2315     default:
2316       fatal("Unsupported type %s", type2name(elem_bt));
2317       break;
2318   }
2319 }
2320 
2321 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2322                                  int vec_enc) {
2323   switch(elem_bt) {
2324     case T_INT:
2325     case T_FLOAT:
2326       vmaskmovps(dst, src, mask, vec_enc);
2327       break;
2328     case T_LONG:
2329     case T_DOUBLE:
2330       vmaskmovpd(dst, src, mask, vec_enc);
2331       break;
2332     default:
2333       fatal("Unsupported type %s", type2name(elem_bt));
2334       break;
2335   }
2336 }
2337 
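     // Float/double min/max reductions: repeatedly fold the upper half (or a
     // permuted lane pair) of the source into the lower half via vminmax_fp and,
     // when is_dst_valid, combine the value already in dst at the end.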
2338 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2339                                           XMMRegister dst, XMMRegister src,
2340                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2341                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2342   const int permconst[] = {1, 14};
2343   XMMRegister wsrc = src;
2344   XMMRegister wdst = xmm_0;
2345   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2346 
2347   int vlen_enc = Assembler::AVX_128bit;
2348   if (vlen == 16) {
2349     vlen_enc = Assembler::AVX_256bit;
2350   }
2351 
2352   for (int i = log2(vlen) - 1; i >=0; i--) {
2353     if (i == 0 && !is_dst_valid) {
2354       wdst = dst;
2355     }
2356     if (i == 3) {
2357       vextracti64x4_high(wtmp, wsrc);
2358     } else if (i == 2) {
2359       vextracti128_high(wtmp, wsrc);
2360     } else { // i = [0,1]
2361       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2362     }
2363 
2364     if (VM_Version::supports_avx10_2()) {
2365       vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
2366     } else {
2367       vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2368     }
2369     wsrc = wdst;
2370     vlen_enc = Assembler::AVX_128bit;
2371   }
2372   if (is_dst_valid) {
2373     if (VM_Version::supports_avx10_2()) {
2374       vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
2375     } else {
2376       vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2377     }
2378   }
2379 }
2380 
2381 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2382                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2383                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2384   XMMRegister wsrc = src;
2385   XMMRegister wdst = xmm_0;
2386   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2387   int vlen_enc = Assembler::AVX_128bit;
2388   if (vlen == 8) {
2389     vlen_enc = Assembler::AVX_256bit;
2390   }
2391   for (int i = log2(vlen) - 1; i >=0; i--) {
2392     if (i == 0 && !is_dst_valid) {
2393       wdst = dst;
2394     }
2395     if (i == 1) {
2396       vextracti128_high(wtmp, wsrc);
2397     } else if (i == 2) {
2398       vextracti64x4_high(wtmp, wsrc);
2399     } else {
2400       assert(i == 0, "%d", i);
2401       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2402     }
2403 
2404     if (VM_Version::supports_avx10_2()) {
2405       vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
2406     } else {
2407       vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2408     }
2409 
2410     wsrc = wdst;
2411     vlen_enc = Assembler::AVX_128bit;
2412   }
2413 
2414   if (is_dst_valid) {
2415     if (VM_Version::supports_avx10_2()) {
2416       vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
2417     } else {
2418       vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2419     }
2420   }
2421 }
2422 
2423 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2424   switch (bt) {
2425     case T_BYTE:  pextrb(dst, src, idx); break;
2426     case T_SHORT: pextrw(dst, src, idx); break;
2427     case T_INT:   pextrd(dst, src, idx); break;
2428     case T_LONG:  pextrq(dst, src, idx); break;
2429 
2430     default:
2431       assert(false,"Should not reach here.");
2432       break;
2433   }
2434 }
2435 
2436 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2437   int esize =  type2aelembytes(typ);
2438   int elem_per_lane = 16/esize;
2439   int lane = elemindex / elem_per_lane;
2440   int eindex = elemindex % elem_per_lane;
2441 
2442   if (lane >= 2) {
2443     assert(UseAVX > 2, "required");
2444     vextractf32x4(dst, src, lane & 3);
2445     return dst;
2446   } else if (lane > 0) {
2447     assert(UseAVX > 0, "required");
2448     vextractf128(dst, src, lane);
2449     return dst;
2450   } else {
2451     return src;
2452   }
2453 }
2454 
2455 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2456   if (typ == T_BYTE) {
2457     movsbl(dst, dst);
2458   } else if (typ == T_SHORT) {
2459     movswl(dst, dst);
2460   }
2461 }
2462 
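     // Extract the integral element at elemindex within its 128-bit lane (see
     // get_lane above) into a GPR, sign-extending byte/short values to int.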
2463 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2464   int esize =  type2aelembytes(typ);
2465   int elem_per_lane = 16/esize;
2466   int eindex = elemindex % elem_per_lane;
2467   assert(is_integral_type(typ),"required");
2468 
2469   if (eindex == 0) {
2470     if (typ == T_LONG) {
2471       movq(dst, src);
2472     } else {
2473       movdl(dst, src);
2474       movsxl(typ, dst);
2475     }
2476   } else {
2477     extract(typ, dst, src, eindex);
2478     movsxl(typ, dst);
2479   }
2480 }
2481 
2482 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2483   int esize =  type2aelembytes(typ);
2484   int elem_per_lane = 16/esize;
2485   int eindex = elemindex % elem_per_lane;
2486   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2487 
2488   if (eindex == 0) {
2489     movq(dst, src);
2490   } else {
2491     if (typ == T_FLOAT) {
2492       if (UseAVX == 0) {
2493         movdqu(dst, src);
2494         shufps(dst, dst, eindex);
2495       } else {
2496         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2497       }
2498     } else {
2499       if (UseAVX == 0) {
2500         movdqu(dst, src);
2501         psrldq(dst, eindex*esize);
2502       } else {
2503         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2504       }
2505       movq(dst, dst);
2506     }
2507   }
2508   // Zero upper bits
2509   if (typ == T_FLOAT) {
2510     if (UseAVX == 0) {
2511       assert(vtmp != xnoreg, "required.");
2512       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2513       pand(dst, vtmp);
2514     } else {
2515       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2516     }
2517   }
2518 }
2519 
2520 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2521   switch(typ) {
2522     case T_BYTE:
2523     case T_BOOLEAN:
2524       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2525       break;
2526     case T_SHORT:
2527     case T_CHAR:
2528       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2529       break;
2530     case T_INT:
2531     case T_FLOAT:
2532       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2533       break;
2534     case T_LONG:
2535     case T_DOUBLE:
2536       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2537       break;
2538     default:
2539       assert(false,"Should not reach here.");
2540       break;
2541   }
2542 }
2543 
2544 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2545   assert(rscratch != noreg || always_reachable(src2), "missing");
2546 
2547   switch(typ) {
2548     case T_BOOLEAN:
2549     case T_BYTE:
2550       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2551       break;
2552     case T_CHAR:
2553     case T_SHORT:
2554       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2555       break;
2556     case T_INT:
2557     case T_FLOAT:
2558       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2559       break;
2560     case T_LONG:
2561     case T_DOUBLE:
2562       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2563       break;
2564     default:
2565       assert(false,"Should not reach here.");
2566       break;
2567   }
2568 }
2569 
2570 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2571   switch(typ) {
2572     case T_BYTE:
2573       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2574       break;
2575     case T_SHORT:
2576       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2577       break;
2578     case T_INT:
2579     case T_FLOAT:
2580       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2581       break;
2582     case T_LONG:
2583     case T_DOUBLE:
2584       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2585       break;
2586     default:
2587       assert(false,"Should not reach here.");
2588       break;
2589   }
2590 }
2591 
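     // Emit a ptest/vptest/vtestps of src1 against src2; when the vector is shorter
     // than 16 bytes the valid low part of src1 is first duplicated into vtmp so the
     // whole test register is meaningful.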
2592 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2593   assert(vlen_in_bytes <= 32, "");
2594   int esize = type2aelembytes(bt);
2595   if (vlen_in_bytes == 32) {
2596     assert(vtmp == xnoreg, "required.");
2597     if (esize >= 4) {
2598       vtestps(src1, src2, AVX_256bit);
2599     } else {
2600       vptest(src1, src2, AVX_256bit);
2601     }
2602     return;
2603   }
2604   if (vlen_in_bytes < 16) {
2605     // Duplicate the lower part to fill the whole register;
2606     // there is no need to do so for src2.
2607     assert(vtmp != xnoreg, "required");
2608     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2609     pshufd(vtmp, src1, shuffle_imm);
2610   } else {
2611     assert(vtmp == xnoreg, "required");
2612     vtmp = src1;
2613   }
2614   if (esize >= 4 && VM_Version::supports_avx()) {
2615     vtestps(vtmp, src2, AVX_128bit);
2616   } else {
2617     ptest(vtmp, src2);
2618   }
2619 }
2620 
2621 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2622 #ifdef ASSERT
2623   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2624   bool is_bw_supported = VM_Version::supports_avx512bw();
2625   if (is_bw && !is_bw_supported) {
2626     assert(vlen_enc != Assembler::AVX_512bit, "required");
2627     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2628            "XMM register should be 0-15");
2629   }
2630 #endif // ASSERT
2631   switch (elem_bt) {
2632     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2633     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2634     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2635     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2636     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2637     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2638     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2639   }
2640 }
2641 
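     // Broadcast a GPR value to all vector lanes, preferring the EVEX GPR-source
     // broadcasts when the required AVX-512 features are available and otherwise
     // going through movdl/movdq plus an AVX2 broadcast.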
2642 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2643   assert(UseAVX >= 2, "required");
2644   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2645   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2646   if ((UseAVX > 2) &&
2647       (!is_bw || VM_Version::supports_avx512bw()) &&
2648       (!is_vl || VM_Version::supports_avx512vl())) {
2649     switch (elem_bt) {
2650       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2651       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2652       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2653       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2654       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2655     }
2656   } else {
2657     assert(vlen_enc != Assembler::AVX_512bit, "required");
2658     assert((dst->encoding() < 16),"XMM register should be 0-15");
2659     switch (elem_bt) {
2660       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2661       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2662       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2663       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2664       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2665       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2666       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2667     }
2668   }
2669 }
2670 
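     // Sign-extend a byte vector to the requested element type, converting through
     // int for the float/double cases.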
2671 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2672   switch (to_elem_bt) {
2673     case T_SHORT:
2674       vpmovsxbw(dst, src, vlen_enc);
2675       break;
2676     case T_INT:
2677       vpmovsxbd(dst, src, vlen_enc);
2678       break;
2679     case T_FLOAT:
2680       vpmovsxbd(dst, src, vlen_enc);
2681       vcvtdq2ps(dst, dst, vlen_enc);
2682       break;
2683     case T_LONG:
2684       vpmovsxbq(dst, src, vlen_enc);
2685       break;
2686     case T_DOUBLE: {
2687       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2688       vpmovsxbd(dst, src, mid_vlen_enc);
2689       vcvtdq2pd(dst, dst, vlen_enc);
2690       break;
2691     }
2692     default:
2693       fatal("Unsupported type %s", type2name(to_elem_bt));
2694       break;
2695   }
2696 }
2697 
2698 //-------------------------------------------------------------------------------------------
2699 
2700 // IndexOf for constant substrings with size >= 8 chars
2701 // which don't need to be loaded through stack.
2702 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2703                                          Register cnt1, Register cnt2,
2704                                          int int_cnt2,  Register result,
2705                                          XMMRegister vec, Register tmp,
2706                                          int ae) {
2707   ShortBranchVerifier sbv(this);
2708   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2709   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2710 
2711   // This method uses the pcmpestri instruction with bound registers
2712   //   inputs:
2713   //     xmm - substring
2714   //     rax - substring length (elements count)
2715   //     mem - scanned string
2716   //     rdx - string length (elements count)
2717   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2718   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2719   //   outputs:
2720   //     rcx - matched index in string
2721   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2722   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2723   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2724   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2725   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2726 
2727   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2728         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2729         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2730 
2731   // Note, inline_string_indexOf() generates checks:
2732   // if (substr.count > string.count) return -1;
2733   // if (substr.count == 0) return 0;
2734   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2735 
2736   // Load substring.
2737   if (ae == StrIntrinsicNode::UL) {
2738     pmovzxbw(vec, Address(str2, 0));
2739   } else {
2740     movdqu(vec, Address(str2, 0));
2741   }
2742   movl(cnt2, int_cnt2);
2743   movptr(result, str1); // string addr
2744 
2745   if (int_cnt2 > stride) {
2746     jmpb(SCAN_TO_SUBSTR);
2747 
2748     // Reload substr for rescan; this code
2749     // is executed only for large substrings (> 8 chars).
2750     bind(RELOAD_SUBSTR);
2751     if (ae == StrIntrinsicNode::UL) {
2752       pmovzxbw(vec, Address(str2, 0));
2753     } else {
2754       movdqu(vec, Address(str2, 0));
2755     }
2756     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2757 
2758     bind(RELOAD_STR);
2759     // We came here after the beginning of the substring was
2760     // matched but the rest of it was not, so we need to search
2761     // again. Start from the next element after the previous match.
2762 
2763     // cnt2 is the number of remaining substring elements and
2764     // cnt1 is the number of remaining string elements when the compare failed.
2765     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2766     subl(cnt1, cnt2);
2767     addl(cnt1, int_cnt2);
2768     movl(cnt2, int_cnt2); // Now restore cnt2
2769 
2770     decrementl(cnt1);     // Shift to next element
2771     cmpl(cnt1, cnt2);
2772     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2773 
2774     addptr(result, (1<<scale1));
2775 
2776   } // (int_cnt2 > 8)
2777 
2778   // Scan string for start of substr in 16-byte vectors
2779   bind(SCAN_TO_SUBSTR);
2780   pcmpestri(vec, Address(result, 0), mode);
2781   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2782   subl(cnt1, stride);
2783   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2784   cmpl(cnt1, cnt2);
2785   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2786   addptr(result, 16);
2787   jmpb(SCAN_TO_SUBSTR);
2788 
2789   // Found a potential substr
2790   bind(FOUND_CANDIDATE);
2791   // Matched whole vector if first element matched (tmp(rcx) == 0).
2792   if (int_cnt2 == stride) {
2793     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2794   } else { // int_cnt2 > 8
2795     jccb(Assembler::overflow, FOUND_SUBSTR);
2796   }
2797   // After pcmpestri tmp(rcx) contains matched element index
2798   // Compute start addr of substr
2799   lea(result, Address(result, tmp, scale1));
2800 
2801   // Make sure string is still long enough
2802   subl(cnt1, tmp);
2803   cmpl(cnt1, cnt2);
2804   if (int_cnt2 == stride) {
2805     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2806   } else { // int_cnt2 > 8
2807     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2808   }
2809   // Left less than substring.
2810 
2811   bind(RET_NOT_FOUND);
2812   movl(result, -1);
2813   jmp(EXIT);
2814 
2815   if (int_cnt2 > stride) {
2816     // This code is optimized for the case when the whole substring
2817     // matches once its head has matched.
2818     bind(MATCH_SUBSTR_HEAD);
2819     pcmpestri(vec, Address(result, 0), mode);
2820     // Reload only the string if it does not match
2821     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2822 
2823     Label CONT_SCAN_SUBSTR;
2824     // Compare the rest of substring (> 8 chars).
2825     bind(FOUND_SUBSTR);
2826     // First 8 chars are already matched.
2827     negptr(cnt2);
2828     addptr(cnt2, stride);
2829 
2830     bind(SCAN_SUBSTR);
2831     subl(cnt1, stride);
2832     cmpl(cnt2, -stride); // Do not read beyond substring
2833     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2834     // Back-up strings to avoid reading beyond substring:
2835     // cnt1 = cnt1 - cnt2 + 8
2836     addl(cnt1, cnt2); // cnt2 is negative
2837     addl(cnt1, stride);
2838     movl(cnt2, stride); negptr(cnt2);
2839     bind(CONT_SCAN_SUBSTR);
2840     if (int_cnt2 < (int)G) {
2841       int tail_off1 = int_cnt2<<scale1;
2842       int tail_off2 = int_cnt2<<scale2;
2843       if (ae == StrIntrinsicNode::UL) {
2844         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2845       } else {
2846         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2847       }
2848       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2849     } else {
2850       // calculate index in register to avoid integer overflow (int_cnt2*2)
2851       movl(tmp, int_cnt2);
2852       addptr(tmp, cnt2);
2853       if (ae == StrIntrinsicNode::UL) {
2854         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2855       } else {
2856         movdqu(vec, Address(str2, tmp, scale2, 0));
2857       }
2858       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2859     }
2860     // Need to reload string pointers if the whole vector did not match
2861     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2862     addptr(cnt2, stride);
2863     jcc(Assembler::negative, SCAN_SUBSTR);
2864     // Fall through if found full substring
2865 
2866   } // (int_cnt2 > 8)
2867 
2868   bind(RET_FOUND);
2869   // Found result if we matched full small substring.
2870   // Compute substr offset
2871   subptr(result, str1);
2872   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2873     shrl(result, 1); // index
2874   }
2875   bind(EXIT);
2876 
2877 } // string_indexofC8
2878 
2879 // Small strings are loaded through the stack if they cross a page boundary.
2880 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2881                                        Register cnt1, Register cnt2,
2882                                        int int_cnt2,  Register result,
2883                                        XMMRegister vec, Register tmp,
2884                                        int ae) {
2885   ShortBranchVerifier sbv(this);
2886   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2887   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2888 
2889   //
2890   // int_cnt2 is length of small (< 8 chars) constant substring
2891   // or (-1) for non constant substring in which case its length
2892   // is in cnt2 register.
2893   //
2894   // Note, inline_string_indexOf() generates checks:
2895   // if (substr.count > string.count) return -1;
2896   // if (substr.count == 0) return 0;
2897   //
2898   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2899   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2900   // This method uses the pcmpestri instruction with bound registers
2901   //   inputs:
2902   //     xmm - substring
2903   //     rax - substring length (elements count)
2904   //     mem - scanned string
2905   //     rdx - string length (elements count)
2906   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2907   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2908   //   outputs:
2909   //     rcx - matched index in string
2910   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2911   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2912   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2913   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2914 
2915   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2916         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2917         FOUND_CANDIDATE;
2918 
2919   { //========================================================
2920     // We don't know where these strings are located
2921     // and we can't read beyond them. Load them through the stack.
2922     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2923 
2924     movptr(tmp, rsp); // save old SP
2925 
2926     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2927       if (int_cnt2 == (1>>scale2)) { // One byte
2928         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2929         load_unsigned_byte(result, Address(str2, 0));
2930         movdl(vec, result); // move 32 bits
2931       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2932         // Not enough header space in 32-bit VM: 12+3 = 15.
2933         movl(result, Address(str2, -1));
2934         shrl(result, 8);
2935         movdl(vec, result); // move 32 bits
2936       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2937         load_unsigned_short(result, Address(str2, 0));
2938         movdl(vec, result); // move 32 bits
2939       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2940         movdl(vec, Address(str2, 0)); // move 32 bits
2941       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2942         movq(vec, Address(str2, 0));  // move 64 bits
2943       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
2944         // Array header size is 12 bytes in 32-bit VM
2945         // + 6 bytes for 3 chars == 18 bytes,
2946         // enough space to load vec and shift.
2947         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2948         if (ae == StrIntrinsicNode::UL) {
2949           int tail_off = int_cnt2-8;
2950           pmovzxbw(vec, Address(str2, tail_off));
2951           psrldq(vec, -2*tail_off);
2952         }
2953         else {
2954           int tail_off = int_cnt2*(1<<scale2);
2955           movdqu(vec, Address(str2, tail_off-16));
2956           psrldq(vec, 16-tail_off);
2957         }
2958       }
2959     } else { // not constant substring
2960       cmpl(cnt2, stride);
2961       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2962 
2963       // We can read beyond the string if str2+16 does not cross a page boundary
2964       // since heaps are aligned and mapped by pages.
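           // For example, with a 4096-byte page a 16-byte load starting at
           // address p stays inside the page iff (p & 0xfff) <= 0xff0, which
           // is exactly the check below on the low bits of str2.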
2965       assert(os::vm_page_size() < (int)G, "default page should be small");
2966       movl(result, str2); // We need only low 32 bits
2967       andl(result, ((int)os::vm_page_size()-1));
2968       cmpl(result, ((int)os::vm_page_size()-16));
2969       jccb(Assembler::belowEqual, CHECK_STR);
2970 
2971       // Move small strings to the stack so that 16 bytes can be loaded into vec.
2972       subptr(rsp, 16);
2973       int stk_offset = wordSize-(1<<scale2);
2974       push(cnt2);
2975 
2976       bind(COPY_SUBSTR);
2977       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2978         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2979         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2980       } else if (ae == StrIntrinsicNode::UU) {
2981         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2982         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2983       }
2984       decrement(cnt2);
2985       jccb(Assembler::notZero, COPY_SUBSTR);
2986 
2987       pop(cnt2);
2988       movptr(str2, rsp);  // New substring address
2989     } // non constant
2990 
2991     bind(CHECK_STR);
2992     cmpl(cnt1, stride);
2993     jccb(Assembler::aboveEqual, BIG_STRINGS);
2994 
2995     // Check cross page boundary.
2996     movl(result, str1); // We need only low 32 bits
2997     andl(result, ((int)os::vm_page_size()-1));
2998     cmpl(result, ((int)os::vm_page_size()-16));
2999     jccb(Assembler::belowEqual, BIG_STRINGS);
3000 
3001     subptr(rsp, 16);
3002     int stk_offset = -(1<<scale1);
3003     if (int_cnt2 < 0) { // not constant
3004       push(cnt2);
3005       stk_offset += wordSize;
3006     }
3007     movl(cnt2, cnt1);
3008 
3009     bind(COPY_STR);
3010     if (ae == StrIntrinsicNode::LL) {
3011       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3012       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3013     } else {
3014       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3015       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3016     }
3017     decrement(cnt2);
3018     jccb(Assembler::notZero, COPY_STR);
3019 
3020     if (int_cnt2 < 0) { // not constant
3021       pop(cnt2);
3022     }
3023     movptr(str1, rsp);  // New string address
3024 
3025     bind(BIG_STRINGS);
3026     // Load substring.
3027     if (int_cnt2 < 0) { // -1
3028       if (ae == StrIntrinsicNode::UL) {
3029         pmovzxbw(vec, Address(str2, 0));
3030       } else {
3031         movdqu(vec, Address(str2, 0));
3032       }
3033       push(cnt2);       // substr count
3034       push(str2);       // substr addr
3035       push(str1);       // string addr
3036     } else {
3037       // Small (< 8 chars) constant substrings are loaded already.
3038       movl(cnt2, int_cnt2);
3039     }
3040     push(tmp);  // original SP
3041 
3042   } // Finished loading
3043 
3044   //========================================================
3045   // Start search
3046   //
3047 
3048   movptr(result, str1); // string addr
3049 
3050   if (int_cnt2  < 0) {  // Only for non constant substring
3051     jmpb(SCAN_TO_SUBSTR);
3052 
3053     // SP saved at sp+0
3054     // String saved at sp+1*wordSize
3055     // Substr saved at sp+2*wordSize
3056     // Substr count saved at sp+3*wordSize
3057 
3058     // Reload substr for rescan; this code
3059     // is executed only for large substrings (> 8 chars)
3060     bind(RELOAD_SUBSTR);
3061     movptr(str2, Address(rsp, 2*wordSize));
3062     movl(cnt2, Address(rsp, 3*wordSize));
3063     if (ae == StrIntrinsicNode::UL) {
3064       pmovzxbw(vec, Address(str2, 0));
3065     } else {
3066       movdqu(vec, Address(str2, 0));
3067     }
3068     // We came here after the beginning of the substring was
3069     // matched but the rest of it was not, so we need to search
3070     // again. Start from the next element after the previous match.
3071     subptr(str1, result); // Restore counter
3072     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3073       shrl(str1, 1);
3074     }
3075     addl(cnt1, str1);
3076     decrementl(cnt1);   // Shift to next element
3077     cmpl(cnt1, cnt2);
3078     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3079 
3080     addptr(result, (1<<scale1));
3081   } // non constant
3082 
3083   // Scan string for start of substr in 16-byte vectors
3084   bind(SCAN_TO_SUBSTR);
3085   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3086   pcmpestri(vec, Address(result, 0), mode);
3087   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3088   subl(cnt1, stride);
3089   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3090   cmpl(cnt1, cnt2);
3091   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3092   addptr(result, 16);
3093 
3094   bind(ADJUST_STR);
3095   cmpl(cnt1, stride); // Do not read beyond string
3096   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3097   // Back-up string to avoid reading beyond string.
3098   lea(result, Address(result, cnt1, scale1, -16));
3099   movl(cnt1, stride);
3100   jmpb(SCAN_TO_SUBSTR);
3101 
3102   // Found a potential substr
3103   bind(FOUND_CANDIDATE);
3104   // After pcmpestri tmp(rcx) contains matched element index
3105 
3106   // Make sure string is still long enough
3107   subl(cnt1, tmp);
3108   cmpl(cnt1, cnt2);
3109   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3110   // Left less than substring.
3111 
3112   bind(RET_NOT_FOUND);
3113   movl(result, -1);
3114   jmp(CLEANUP);
3115 
3116   bind(FOUND_SUBSTR);
3117   // Compute start addr of substr
3118   lea(result, Address(result, tmp, scale1));
3119   if (int_cnt2 > 0) { // Constant substring
3120     // Repeat search for small substring (< 8 chars)
3121     // from new point without reloading substring.
3122     // Have to check that we don't read beyond string.
3123     cmpl(tmp, stride-int_cnt2);
3124     jccb(Assembler::greater, ADJUST_STR);
3125     // Fall through if matched whole substring.
3126   } else { // non constant
3127     assert(int_cnt2 == -1, "should be != 0");
3128 
3129     addl(tmp, cnt2);
3130     // Found result if we matched whole substring.
3131     cmpl(tmp, stride);
3132     jcc(Assembler::lessEqual, RET_FOUND);
3133 
3134     // Repeat search for small substring (<= 8 chars)
3135     // from new point 'str1' without reloading substring.
3136     cmpl(cnt2, stride);
3137     // Have to check that we don't read beyond string.
3138     jccb(Assembler::lessEqual, ADJUST_STR);
3139 
3140     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3141     // Compare the rest of substring (> 8 chars).
3142     movptr(str1, result);
3143 
3144     cmpl(tmp, cnt2);
3145     // First 8 chars are already matched.
3146     jccb(Assembler::equal, CHECK_NEXT);
3147 
3148     bind(SCAN_SUBSTR);
3149     pcmpestri(vec, Address(str1, 0), mode);
3150     // Need to reload string pointers if the whole vector did not match
3151     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3152 
3153     bind(CHECK_NEXT);
3154     subl(cnt2, stride);
3155     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3156     addptr(str1, 16);
3157     if (ae == StrIntrinsicNode::UL) {
3158       addptr(str2, 8);
3159     } else {
3160       addptr(str2, 16);
3161     }
3162     subl(cnt1, stride);
3163     cmpl(cnt2, stride); // Do not read beyond substring
3164     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3165     // Back-up strings to avoid reading beyond substring.
3166 
3167     if (ae == StrIntrinsicNode::UL) {
3168       lea(str2, Address(str2, cnt2, scale2, -8));
3169       lea(str1, Address(str1, cnt2, scale1, -16));
3170     } else {
3171       lea(str2, Address(str2, cnt2, scale2, -16));
3172       lea(str1, Address(str1, cnt2, scale1, -16));
3173     }
3174     subl(cnt1, cnt2);
3175     movl(cnt2, stride);
3176     addl(cnt1, stride);
3177     bind(CONT_SCAN_SUBSTR);
3178     if (ae == StrIntrinsicNode::UL) {
3179       pmovzxbw(vec, Address(str2, 0));
3180     } else {
3181       movdqu(vec, Address(str2, 0));
3182     }
3183     jmp(SCAN_SUBSTR);
3184 
3185     bind(RET_FOUND_LONG);
3186     movptr(str1, Address(rsp, wordSize));
3187   } // non constant
3188 
3189   bind(RET_FOUND);
3190   // Compute substr offset
3191   subptr(result, str1);
3192   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3193     shrl(result, 1); // index
3194   }
3195   bind(CLEANUP);
3196   pop(rsp); // restore SP
3197 
3198 } // string_indexof
3199 
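     // Search a UTF-16 string for a single char; roughly equivalent to the
     // following reference loop (illustrative only):
     //   static int indexOfChar(char[] value, int ch, int cnt) {
     //     for (int i = 0; i < cnt; i++) {
     //       if (value[i] == ch) return i;
     //     }
     //     return -1;
     //   }
     // The vector loops below compare 16 chars (AVX2) or 8 chars (SSE4.2) at a
     // time and fall back to a scalar tail loop.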
3200 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3201                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3202   ShortBranchVerifier sbv(this);
3203   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3204 
3205   int stride = 8;
3206 
3207   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3208         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3209         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3210         FOUND_SEQ_CHAR, DONE_LABEL;
3211 
3212   movptr(result, str1);
3213   if (UseAVX >= 2) {
3214     cmpl(cnt1, stride);
3215     jcc(Assembler::less, SCAN_TO_CHAR);
3216     cmpl(cnt1, 2*stride);
3217     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3218     movdl(vec1, ch);
3219     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3220     vpxor(vec2, vec2);
3221     movl(tmp, cnt1);
3222     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3223     andl(cnt1,0x0000000F);  //tail count (in chars)
3224 
3225     bind(SCAN_TO_16_CHAR_LOOP);
3226     vmovdqu(vec3, Address(result, 0));
3227     vpcmpeqw(vec3, vec3, vec1, 1);
3228     vptest(vec2, vec3);
3229     jcc(Assembler::carryClear, FOUND_CHAR);
3230     addptr(result, 32);
3231     subl(tmp, 2*stride);
3232     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3233     jmp(SCAN_TO_8_CHAR);
3234     bind(SCAN_TO_8_CHAR_INIT);
3235     movdl(vec1, ch);
3236     pshuflw(vec1, vec1, 0x00);
3237     pshufd(vec1, vec1, 0);
3238     pxor(vec2, vec2);
3239   }
3240   bind(SCAN_TO_8_CHAR);
3241   cmpl(cnt1, stride);
3242   jcc(Assembler::less, SCAN_TO_CHAR);
3243   if (UseAVX < 2) {
3244     movdl(vec1, ch);
3245     pshuflw(vec1, vec1, 0x00);
3246     pshufd(vec1, vec1, 0);
3247     pxor(vec2, vec2);
3248   }
3249   movl(tmp, cnt1);
3250   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3251   andl(cnt1,0x00000007);  //tail count (in chars)
3252 
3253   bind(SCAN_TO_8_CHAR_LOOP);
3254   movdqu(vec3, Address(result, 0));
3255   pcmpeqw(vec3, vec1);
3256   ptest(vec2, vec3);
3257   jcc(Assembler::carryClear, FOUND_CHAR);
3258   addptr(result, 16);
3259   subl(tmp, stride);
3260   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3261   bind(SCAN_TO_CHAR);
3262   testl(cnt1, cnt1);
3263   jcc(Assembler::zero, RET_NOT_FOUND);
3264   bind(SCAN_TO_CHAR_LOOP);
3265   load_unsigned_short(tmp, Address(result, 0));
3266   cmpl(ch, tmp);
3267   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3268   addptr(result, 2);
3269   subl(cnt1, 1);
3270   jccb(Assembler::zero, RET_NOT_FOUND);
3271   jmp(SCAN_TO_CHAR_LOOP);
3272 
3273   bind(RET_NOT_FOUND);
3274   movl(result, -1);
3275   jmpb(DONE_LABEL);
3276 
3277   bind(FOUND_CHAR);
3278   if (UseAVX >= 2) {
3279     vpmovmskb(tmp, vec3);
3280   } else {
3281     pmovmskb(tmp, vec3);
3282   }
3283   bsfl(ch, tmp);
3284   addptr(result, ch);
3285 
3286   bind(FOUND_SEQ_CHAR);
3287   subptr(result, str1);
3288   shrl(result, 1);
3289 
3290   bind(DONE_LABEL);
3291 } // string_indexof_char
3292 
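     // Latin-1 variant of the char search above: same reference logic, but the
     // haystack elements are single bytes, so each vector covers 32 (AVX2) or
     // 16 (SSE4.2) elements and the resulting index needs no scaling.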
3293 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3294                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3295   ShortBranchVerifier sbv(this);
3296   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3297 
3298   int stride = 16;
3299 
3300   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3301         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3302         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3303         FOUND_SEQ_CHAR, DONE_LABEL;
3304 
3305   movptr(result, str1);
3306   if (UseAVX >= 2) {
3307     cmpl(cnt1, stride);
3308     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3309     cmpl(cnt1, stride*2);
3310     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3311     movdl(vec1, ch);
3312     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3313     vpxor(vec2, vec2);
3314     movl(tmp, cnt1);
3315     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3316     andl(cnt1,0x0000001F);  //tail count (in chars)
3317 
3318     bind(SCAN_TO_32_CHAR_LOOP);
3319     vmovdqu(vec3, Address(result, 0));
3320     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3321     vptest(vec2, vec3);
3322     jcc(Assembler::carryClear, FOUND_CHAR);
3323     addptr(result, 32);
3324     subl(tmp, stride*2);
3325     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3326     jmp(SCAN_TO_16_CHAR);
3327 
3328     bind(SCAN_TO_16_CHAR_INIT);
3329     movdl(vec1, ch);
3330     pxor(vec2, vec2);
3331     pshufb(vec1, vec2);
3332   }
3333 
3334   bind(SCAN_TO_16_CHAR);
3335   cmpl(cnt1, stride);
3336   jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
3337   if (UseAVX < 2) {
3338     movdl(vec1, ch);
3339     pxor(vec2, vec2);
3340     pshufb(vec1, vec2);
3341   }
3342   movl(tmp, cnt1);
3343   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3344   andl(cnt1,0x0000000F);  //tail count (in bytes)
3345 
3346   bind(SCAN_TO_16_CHAR_LOOP);
3347   movdqu(vec3, Address(result, 0));
3348   pcmpeqb(vec3, vec1);
3349   ptest(vec2, vec3);
3350   jcc(Assembler::carryClear, FOUND_CHAR);
3351   addptr(result, 16);
3352   subl(tmp, stride);
3353   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3354 
3355   bind(SCAN_TO_CHAR_INIT);
3356   testl(cnt1, cnt1);
3357   jcc(Assembler::zero, RET_NOT_FOUND);
3358   bind(SCAN_TO_CHAR_LOOP);
3359   load_unsigned_byte(tmp, Address(result, 0));
3360   cmpl(ch, tmp);
3361   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3362   addptr(result, 1);
3363   subl(cnt1, 1);
3364   jccb(Assembler::zero, RET_NOT_FOUND);
3365   jmp(SCAN_TO_CHAR_LOOP);
3366 
3367   bind(RET_NOT_FOUND);
3368   movl(result, -1);
3369   jmpb(DONE_LABEL);
3370 
3371   bind(FOUND_CHAR);
3372   if (UseAVX >= 2) {
3373     vpmovmskb(tmp, vec3);
3374   } else {
3375     pmovmskb(tmp, vec3);
3376   }
3377   bsfl(ch, tmp);
3378   addptr(result, ch);
3379 
3380   bind(FOUND_SEQ_CHAR);
3381   subptr(result, str1);
3382 
3383   bind(DONE_LABEL);
3384 } // stringL_indexof_char
3385 
3386 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3387   switch (eltype) {
3388   case T_BOOLEAN: return sizeof(jboolean);
3389   case T_BYTE:  return sizeof(jbyte);
3390   case T_SHORT: return sizeof(jshort);
3391   case T_CHAR:  return sizeof(jchar);
3392   case T_INT:   return sizeof(jint);
3393   default:
3394     ShouldNotReachHere();
3395     return -1;
3396   }
3397 }
3398 
3399 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3400   switch (eltype) {
3401   // T_BOOLEAN used as surrogate for unsigned byte
3402   case T_BOOLEAN: movzbl(dst, src);   break;
3403   case T_BYTE:    movsbl(dst, src);   break;
3404   case T_SHORT:   movswl(dst, src);   break;
3405   case T_CHAR:    movzwl(dst, src);   break;
3406   case T_INT:     movl(dst, src);     break;
3407   default:
3408     ShouldNotReachHere();
3409   }
3410 }
3411 
3412 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3413   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3414 }
3415 
3416 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3417   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3418 }
3419 
3420 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3421   const int vlen = Assembler::AVX_256bit;
3422   switch (eltype) {
3423   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3424   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3425   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3426   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3427   case T_INT:
3428     // do nothing
3429     break;
3430   default:
3431     ShouldNotReachHere();
3432   }
3433 }
3434 
3435 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3436                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3437                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3438                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3439                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3440                                         BasicType eltype) {
3441   ShortBranchVerifier sbv(this);
3442   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3443   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3444   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3445 
3446   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3447         SHORT_UNROLLED_LOOP_EXIT,
3448         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3449         UNROLLED_VECTOR_LOOP_BEGIN,
3450         END;
3451   switch (eltype) {
3452   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3453   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3454   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3455   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3456   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3457   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3458   }
3459 
3460   // Register "renaming" for readability of the code
3461   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3462                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3463                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3464 
3465   const int elsize = arrays_hashcode_elsize(eltype);
3466 
3467   /*
3468     if (cnt1 >= 2) {
3469       if (cnt1 >= 32) {
3470         UNROLLED VECTOR LOOP
3471       }
3472       UNROLLED SCALAR LOOP
3473     }
3474     SINGLE SCALAR
3475    */
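       // Math note: for the polynomial hash h = 31*h + a[i], folding a block of
       // 32 elements at once uses the identity
       //   h' = h * 31^32 + a[0]*31^31 + a[1]*31^30 + ... + a[31]*31^0,
       // which is why the scalar result is multiplied by 'next' once per
       // iteration while the vector lanes accumulate the a[k] terms and are
       // scaled by the matching powers of 31 just before the final reduction.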
3476 
3477   cmpl(cnt1, 32);
3478   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3479 
3480   // cnt1 >= 32 && generate_vectorized_loop
3481   xorl(index, index);
3482 
3483   // vresult = IntVector.zero(I256);
3484   for (int idx = 0; idx < 4; idx++) {
3485     vpxor(vresult[idx], vresult[idx]);
3486   }
3487   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3488   Register bound = tmp2;
3489   Register next = tmp3;
3490   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3491   movl(next, Address(tmp2, 0));
3492   movdl(vnext, next);
3493   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3494 
3495   // index = 0;
3496   // bound = cnt1 & ~(32 - 1);
3497   movl(bound, cnt1);
3498   andl(bound, ~(32 - 1));
3499   // for (; index < bound; index += 32) {
3500   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3501   // result *= next;
3502   imull(result, next);
3503   // Loop fission to front-load the cost of fetching from memory; OOO execution
3504   // can then hopefully do a better job of prefetching.
3505   for (int idx = 0; idx < 4; idx++) {
3506     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3507   }
3508   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3509   for (int idx = 0; idx < 4; idx++) {
3510     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3511     arrays_hashcode_elvcast(vtmp[idx], eltype);
3512     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3513   }
3514   // index += 32;
3515   addl(index, 32);
3516   // index < bound;
3517   cmpl(index, bound);
3518   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3519   // }
3520 
3521   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3522   subl(cnt1, bound);
3523   // release bound
3524 
3525   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3526   for (int idx = 0; idx < 4; idx++) {
3527     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3528     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3529     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3530   }
3531   // result += vresult.reduceLanes(ADD);
3532   for (int idx = 0; idx < 4; idx++) {
3533     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3534   }
3535 
3536   // } else if (cnt1 < 32) {
3537 
3538   bind(SHORT_UNROLLED_BEGIN);
3539   // int i = 1;
3540   movl(index, 1);
3541   cmpl(index, cnt1);
3542   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3543 
3544   // for (; i < cnt1 ; i += 2) {
3545   bind(SHORT_UNROLLED_LOOP_BEGIN);
3546   movl(tmp3, 961);
3547   imull(result, tmp3);
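       // 961 == 31 * 31: two elements are folded per iteration, so the running
       // hash advances by 31^2 each time around the loop.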
3548   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3549   movl(tmp3, tmp2);
3550   shll(tmp3, 5);
3551   subl(tmp3, tmp2);
3552   addl(result, tmp3);
3553   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3554   addl(result, tmp3);
3555   addl(index, 2);
3556   cmpl(index, cnt1);
3557   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3558 
3559   // }
3560   // if (i >= cnt1) {
3561   bind(SHORT_UNROLLED_LOOP_EXIT);
3562   jccb(Assembler::greater, END);
3563   movl(tmp2, result);
3564   shll(result, 5);
3565   subl(result, tmp2);
3566   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3567   addl(result, tmp3);
3568   // }
3569   bind(END);
3570 
3571   BLOCK_COMMENT("} // arrays_hashcode");
3572 
3573 } // arrays_hashcode
3574 
3575 // helper function for string_compare
3576 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3577                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3578                                            Address::ScaleFactor scale2, Register index, int ae) {
3579   if (ae == StrIntrinsicNode::LL) {
3580     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3581     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3582   } else if (ae == StrIntrinsicNode::UU) {
3583     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3584     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3585   } else {
3586     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3587     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3588   }
3589 }
3590 
3591 // Compare strings, used for char[] and byte[].
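     // The result follows the usual compareTo contract; a reference sketch
     // (illustrative only, ignoring the LL/UU/LU/UL encoding combinations
     // handled below):
     //   static int compare(char[] s1, int cnt1, char[] s2, int cnt2) {
     //     int min = Math.min(cnt1, cnt2);
     //     for (int i = 0; i < min; i++) {
     //       if (s1[i] != s2[i]) return s1[i] - s2[i];
     //     }
     //     return cnt1 - cnt2;
     //   }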
3592 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3593                                        Register cnt1, Register cnt2, Register result,
3594                                        XMMRegister vec1, int ae, KRegister mask) {
3595   ShortBranchVerifier sbv(this);
3596   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3597   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only AVX3
3598   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3599   int stride2x2 = 0x40;
3600   Address::ScaleFactor scale = Address::no_scale;
3601   Address::ScaleFactor scale1 = Address::no_scale;
3602   Address::ScaleFactor scale2 = Address::no_scale;
3603 
3604   if (ae != StrIntrinsicNode::LL) {
3605     stride2x2 = 0x20;
3606   }
3607 
3608   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3609     shrl(cnt2, 1);
3610   }
3611   // Compute the minimum of the string lengths, and push the
3612   // difference of the string lengths onto the stack.
3613   // The minimum is computed with a conditional move.
3614   movl(result, cnt1);
3615   subl(cnt1, cnt2);
3616   push(cnt1);
3617   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3618 
3619   // Is the minimum length zero?
3620   testl(cnt2, cnt2);
3621   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3622   if (ae == StrIntrinsicNode::LL) {
3623     // Load first bytes
3624     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3625     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3626   } else if (ae == StrIntrinsicNode::UU) {
3627     // Load first characters
3628     load_unsigned_short(result, Address(str1, 0));
3629     load_unsigned_short(cnt1, Address(str2, 0));
3630   } else {
3631     load_unsigned_byte(result, Address(str1, 0));
3632     load_unsigned_short(cnt1, Address(str2, 0));
3633   }
3634   subl(result, cnt1);
3635   jcc(Assembler::notZero,  POP_LABEL);
3636 
3637   if (ae == StrIntrinsicNode::UU) {
3638     // Divide length by 2 to get number of chars
3639     shrl(cnt2, 1);
3640   }
3641   cmpl(cnt2, 1);
3642   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3643 
3644   // Check if the strings start at the same location and set up scale and stride
3645   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3646     cmpptr(str1, str2);
3647     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3648     if (ae == StrIntrinsicNode::LL) {
3649       scale = Address::times_1;
3650       stride = 16;
3651     } else {
3652       scale = Address::times_2;
3653       stride = 8;
3654     }
3655   } else {
3656     scale1 = Address::times_1;
3657     scale2 = Address::times_2;
3658     // scale not used
3659     stride = 8;
3660   }
3661 
3662   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3663     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3664     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3665     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3666     Label COMPARE_TAIL_LONG;
3667     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only AVX3
3668 
3669     int pcmpmask = 0x19;
3670     if (ae == StrIntrinsicNode::LL) {
3671       pcmpmask &= ~0x01;
3672     }
3673 
3674     // Set up to compare 16-char (32-byte) vectors,
3675     // starting from the first character again because it has an aligned address.
3676     if (ae == StrIntrinsicNode::LL) {
3677       stride2 = 32;
3678     } else {
3679       stride2 = 16;
3680     }
3681     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3682       adr_stride = stride << scale;
3683     } else {
3684       adr_stride1 = 8;  //stride << scale1;
3685       adr_stride2 = 16; //stride << scale2;
3686     }
3687 
3688     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3689     // rax and rdx are used by pcmpestri as elements counters
3690     movl(result, cnt2);
3691     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3692     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3693 
3694     // Fast path: compare the first two 8-char vectors.
3695     bind(COMPARE_16_CHARS);
3696     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3697       movdqu(vec1, Address(str1, 0));
3698     } else {
3699       pmovzxbw(vec1, Address(str1, 0));
3700     }
3701     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3702     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3703 
3704     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3705       movdqu(vec1, Address(str1, adr_stride));
3706       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3707     } else {
3708       pmovzxbw(vec1, Address(str1, adr_stride1));
3709       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3710     }
3711     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3712     addl(cnt1, stride);
3713 
3714     // Compare the characters at index in cnt1
3715     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3716     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3717     subl(result, cnt2);
3718     jmp(POP_LABEL);
3719 
3720     // Setup the registers to start vector comparison loop
3721     bind(COMPARE_WIDE_VECTORS);
3722     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3723       lea(str1, Address(str1, result, scale));
3724       lea(str2, Address(str2, result, scale));
3725     } else {
3726       lea(str1, Address(str1, result, scale1));
3727       lea(str2, Address(str2, result, scale2));
3728     }
3729     subl(result, stride2);
3730     subl(cnt2, stride2);
3731     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3732     negptr(result);
3733 
3734     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3735     bind(COMPARE_WIDE_VECTORS_LOOP);
3736 
3737     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3738       cmpl(cnt2, stride2x2);
3739       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3740       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3741       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3742 
3743       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3744       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3745         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3746         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3747       } else {
3748         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3749         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3750       }
3751       kortestql(mask, mask);
3752       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3753       addptr(result, stride2x2);  // update since we already compared at this addr
3754       subl(cnt2, stride2x2);      // and sub the size too
3755       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3756 
3757       vpxor(vec1, vec1);
3758       jmpb(COMPARE_WIDE_TAIL);
3759     }//if (VM_Version::supports_avx512vlbw())
3760 
3761     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3762     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3763       vmovdqu(vec1, Address(str1, result, scale));
3764       vpxor(vec1, Address(str2, result, scale));
3765     } else {
3766       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3767       vpxor(vec1, Address(str2, result, scale2));
3768     }
3769     vptest(vec1, vec1);
3770     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3771     addptr(result, stride2);
3772     subl(cnt2, stride2);
3773     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3774     // clean upper bits of YMM registers
3775     vpxor(vec1, vec1);
3776 
3777     // compare wide vectors tail
3778     bind(COMPARE_WIDE_TAIL);
3779     testptr(result, result);
3780     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3781 
3782     movl(result, stride2);
3783     movl(cnt2, result);
3784     negptr(result);
3785     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3786 
3787     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3788     bind(VECTOR_NOT_EQUAL);
3789     // clean upper bits of YMM registers
3790     vpxor(vec1, vec1);
3791     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3792       lea(str1, Address(str1, result, scale));
3793       lea(str2, Address(str2, result, scale));
3794     } else {
3795       lea(str1, Address(str1, result, scale1));
3796       lea(str2, Address(str2, result, scale2));
3797     }
3798     jmp(COMPARE_16_CHARS);
3799 
3800     // Compare tail chars, length between 1 to 15 chars
3801     bind(COMPARE_TAIL_LONG);
3802     movl(cnt2, result);
3803     cmpl(cnt2, stride);
3804     jcc(Assembler::less, COMPARE_SMALL_STR);
3805 
3806     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3807       movdqu(vec1, Address(str1, 0));
3808     } else {
3809       pmovzxbw(vec1, Address(str1, 0));
3810     }
3811     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3812     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3813     subptr(cnt2, stride);
3814     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3815     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3816       lea(str1, Address(str1, result, scale));
3817       lea(str2, Address(str2, result, scale));
3818     } else {
3819       lea(str1, Address(str1, result, scale1));
3820       lea(str2, Address(str2, result, scale2));
3821     }
3822     negptr(cnt2);
3823     jmpb(WHILE_HEAD_LABEL);
3824 
3825     bind(COMPARE_SMALL_STR);
3826   } else if (UseSSE42Intrinsics) {
3827     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3828     int pcmpmask = 0x19;
3829     // Set up to compare 8-char (16-byte) vectors,
3830     // starting from the first character again because it has an aligned address.
3831     movl(result, cnt2);
3832     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3833     if (ae == StrIntrinsicNode::LL) {
3834       pcmpmask &= ~0x01;
3835     }
3836     jcc(Assembler::zero, COMPARE_TAIL);
3837     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3838       lea(str1, Address(str1, result, scale));
3839       lea(str2, Address(str2, result, scale));
3840     } else {
3841       lea(str1, Address(str1, result, scale1));
3842       lea(str2, Address(str2, result, scale2));
3843     }
3844     negptr(result);
3845 
3846     // pcmpestri
3847     //   inputs:
3848     //     vec1 - substring
3849     //     rax - negative string length (elements count)
3850     //     mem - scanned string
3851     //     rdx - string length (elements count)
3852     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3853     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3854     //   outputs:
3855     //     rcx - first mismatched element index
3856     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3857 
3858     bind(COMPARE_WIDE_VECTORS);
3859     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3860       movdqu(vec1, Address(str1, result, scale));
3861       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3862     } else {
3863       pmovzxbw(vec1, Address(str1, result, scale1));
3864       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3865     }
3866     // After pcmpestri cnt1(rcx) contains mismatched element index
3867 
3868     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3869     addptr(result, stride);
3870     subptr(cnt2, stride);
3871     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3872 
3873     // compare wide vectors tail
3874     testptr(result, result);
3875     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3876 
3877     movl(cnt2, stride);
3878     movl(result, stride);
3879     negptr(result);
3880     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3881       movdqu(vec1, Address(str1, result, scale));
3882       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3883     } else {
3884       pmovzxbw(vec1, Address(str1, result, scale1));
3885       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3886     }
3887     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3888 
3889     // Mismatched characters in the vectors
3890     bind(VECTOR_NOT_EQUAL);
3891     addptr(cnt1, result);
3892     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3893     subl(result, cnt2);
3894     jmpb(POP_LABEL);
3895 
3896     bind(COMPARE_TAIL); // limit is zero
3897     movl(cnt2, result);
3898     // Fallthru to tail compare
3899   }
3900   // Shift str2 and str1 to the end of the arrays, negate min
3901   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3902     lea(str1, Address(str1, cnt2, scale));
3903     lea(str2, Address(str2, cnt2, scale));
3904   } else {
3905     lea(str1, Address(str1, cnt2, scale1));
3906     lea(str2, Address(str2, cnt2, scale2));
3907   }
3908   decrementl(cnt2);  // first character was compared already
3909   negptr(cnt2);
3910 
3911   // Compare the rest of the elements
3912   bind(WHILE_HEAD_LABEL);
3913   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3914   subl(result, cnt1);
3915   jccb(Assembler::notZero, POP_LABEL);
3916   increment(cnt2);
3917   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3918 
3919   // Strings are equal up to min length.  Return the length difference.
3920   bind(LENGTH_DIFF_LABEL);
3921   pop(result);
3922   if (ae == StrIntrinsicNode::UU) {
3923     // Divide diff by 2 to get number of chars
3924     sarl(result, 1);
3925   }
3926   jmpb(DONE_LABEL);
3927 
3928   if (VM_Version::supports_avx512vlbw()) {
3929 
3930     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3931 
3932     kmovql(cnt1, mask);
3933     notq(cnt1);
3934     bsfq(cnt2, cnt1);
3935     if (ae != StrIntrinsicNode::LL) {
3936       // Divide diff by 2 to get number of chars
3937       sarl(cnt2, 1);
3938     }
3939     addq(result, cnt2);
3940     if (ae == StrIntrinsicNode::LL) {
3941       load_unsigned_byte(cnt1, Address(str2, result));
3942       load_unsigned_byte(result, Address(str1, result));
3943     } else if (ae == StrIntrinsicNode::UU) {
3944       load_unsigned_short(cnt1, Address(str2, result, scale));
3945       load_unsigned_short(result, Address(str1, result, scale));
3946     } else {
3947       load_unsigned_short(cnt1, Address(str2, result, scale2));
3948       load_unsigned_byte(result, Address(str1, result, scale1));
3949     }
3950     subl(result, cnt1);
3951     jmpb(POP_LABEL);
3952   }//if (VM_Version::supports_avx512vlbw())
3953 
3954   // Discard the stored length difference
3955   bind(POP_LABEL);
3956   pop(cnt1);
3957 
3958   // That's it
3959   bind(DONE_LABEL);
3960   if (ae == StrIntrinsicNode::UL) {
3961     negl(result);
3962   }
3963 
3964 }
3965 
3966 // Search for a non-ASCII character (negative byte value) in a byte array;
3967 // return the index of the first such character, otherwise the length
3968 // of the array segment searched.
3969 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3970 //   @IntrinsicCandidate
3971 //   public static int countPositives(byte[] ba, int off, int len) {
3972 //     for (int i = off; i < off + len; i++) {
3973 //       if (ba[i] < 0) {
3974 //         return i - off;
3975 //       }
3976 //     }
3977 //     return len;
3978 //   }
3979 void C2_MacroAssembler::count_positives(Register ary1, Register len,
3980   Register result, Register tmp1,
3981   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3982   // rsi: byte array
3983   // rcx: len
3984   // rax: result
3985   ShortBranchVerifier sbv(this);
3986   assert_different_registers(ary1, len, result, tmp1);
3987   assert_different_registers(vec1, vec2);
3988   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3989 
3990   movl(result, len); // copy
3991   // len == 0
3992   testl(len, len);
3993   jcc(Assembler::zero, DONE);
3994 
3995   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3996     VM_Version::supports_avx512vlbw() &&
3997     VM_Version::supports_bmi2()) {
3998 
3999     Label test_64_loop, test_tail, BREAK_LOOP;
4000     movl(tmp1, len);
4001     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4002 
4003     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4004     andl(len,  0xffffffc0); // vector count (in chars)
4005     jccb(Assembler::zero, test_tail);
4006 
4007     lea(ary1, Address(ary1, len, Address::times_1));
4008     negptr(len);
4009 
4010     bind(test_64_loop);
4011     // Check whether our 64 elements of size byte contain negatives
4012     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4013     kortestql(mask1, mask1);
4014     jcc(Assembler::notZero, BREAK_LOOP);
4015 
4016     addptr(len, 64);
4017     jccb(Assembler::notZero, test_64_loop);
4018 
4019     bind(test_tail);
4020     // bail out when there is nothing to be done
4021     testl(tmp1, -1);
4022     jcc(Assembler::zero, DONE);
4023 
4024 
4025     // Check the tail for the absence of negatives:
4026     // build the mask ~(~0 << tmp1), i.e. the low 'tmp1' (tail count) bits set
4027     {
4028       Register tmp3_aliased = len;
4029       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4030       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4031       notq(tmp3_aliased);
4032       kmovql(mask2, tmp3_aliased);
4033     }
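         // e.g. tmp1 == 5 gives mask2 == 0b11111, so only the five remaining
         // tail bytes take part in the masked compare below.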
4034 
4035     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4036     ktestq(mask1, mask2);
4037     jcc(Assembler::zero, DONE);
4038 
4039     // do a full check for negative bytes in the tail
4040     movl(len, tmp1); // tmp1 holds low 6-bit from original len;
4041                      // ary1 already pointing to the right place
4042     jmpb(TAIL_START);
4043 
4044     bind(BREAK_LOOP);
4045     // At least one byte in the last 64 byte block was negative.
4046     // Set up to look at the last 64 bytes as if they were a tail
4047     lea(ary1, Address(ary1, len, Address::times_1));
4048     addptr(result, len);
4049     // Ignore the very last byte: if all others are positive,
4050     // it must be negative, so we can skip right to the 2+1 byte
4051     // end comparison at this point
4052     orl(result, 63);
4053     movl(len, 63);
4054     // Fallthru to tail compare
4055   } else {
4056 
4057     if (UseAVX >= 2) {
4058       // With AVX2, use 32-byte vector compare
4059       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4060 
4061       // Compare 32-byte vectors
4062       testl(len, 0xffffffe0);   // vector count (in bytes)
4063       jccb(Assembler::zero, TAIL_START);
4064 
4065       andl(len, 0xffffffe0);
4066       lea(ary1, Address(ary1, len, Address::times_1));
4067       negptr(len);
4068 
4069       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
4070       movdl(vec2, tmp1);
4071       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4072 
4073       bind(COMPARE_WIDE_VECTORS);
4074       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4075       vptest(vec1, vec2);
4076       jccb(Assembler::notZero, BREAK_LOOP);
4077       addptr(len, 32);
4078       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4079 
4080       testl(result, 0x0000001f);   // any bytes remaining?
4081       jcc(Assembler::zero, DONE);
4082 
4083       // Quick test using the already prepared vector mask
4084       movl(len, result);
4085       andl(len, 0x0000001f);
4086       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4087       vptest(vec1, vec2);
4088       jcc(Assembler::zero, DONE);
4089       // There are zeros, jump to the tail to determine exactly where
4090       jmpb(TAIL_START);
4091 
4092       bind(BREAK_LOOP);
4093       // At least one byte in the last 32-byte vector is negative.
4094       // Set up to look at the last 32 bytes as if they were a tail
4095       lea(ary1, Address(ary1, len, Address::times_1));
4096       addptr(result, len);
4097       // Ignore the very last byte: if all others are positive,
4098       // it must be negative, so we can skip right to the 2+1 byte
4099       // end comparison at this point
4100       orl(result, 31);
4101       movl(len, 31);
4102       // Fallthru to tail compare
4103     } else if (UseSSE42Intrinsics) {
4104       // With SSE4.2, use double quad vector compare
4105       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4106 
4107       // Compare 16-byte vectors
4108       testl(len, 0xfffffff0);   // vector count (in bytes)
4109       jcc(Assembler::zero, TAIL_START);
4110 
4111       andl(len, 0xfffffff0);
4112       lea(ary1, Address(ary1, len, Address::times_1));
4113       negptr(len);
4114 
4115       movl(tmp1, 0x80808080);
4116       movdl(vec2, tmp1);
4117       pshufd(vec2, vec2, 0);
4118 
4119       bind(COMPARE_WIDE_VECTORS);
4120       movdqu(vec1, Address(ary1, len, Address::times_1));
4121       ptest(vec1, vec2);
4122       jccb(Assembler::notZero, BREAK_LOOP);
4123       addptr(len, 16);
4124       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4125 
4126       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4127       jcc(Assembler::zero, DONE);
4128 
4129       // Quick test using the already prepared vector mask
4130       movl(len, result);
4131       andl(len, 0x0000000f);   // tail count (in bytes)
4132       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4133       ptest(vec1, vec2);
4134       jcc(Assembler::zero, DONE);
4135       jmpb(TAIL_START);
4136 
4137       bind(BREAK_LOOP);
4138       // At least one byte in the last 16-byte vector is negative.
4139       // Set up and look at the last 16 bytes as if they were a tail
4140       lea(ary1, Address(ary1, len, Address::times_1));
4141       addptr(result, len);
4142       // Ignore the very last byte: if all others are positive,
4143       // it must be negative, so we can skip right to the 2+1 byte
4144       // end comparison at this point
4145       orl(result, 15);
4146       movl(len, 15);
4147       // Fallthru to tail compare
4148     }
4149   }
4150 
4151   bind(TAIL_START);
4152   // Compare 4-byte vectors
4153   andl(len, 0xfffffffc); // vector count (in bytes)
4154   jccb(Assembler::zero, COMPARE_CHAR);
4155 
4156   lea(ary1, Address(ary1, len, Address::times_1));
4157   negptr(len);
4158 
4159   bind(COMPARE_VECTORS);
4160   movl(tmp1, Address(ary1, len, Address::times_1));
4161   andl(tmp1, 0x80808080);
4162   jccb(Assembler::notZero, TAIL_ADJUST);
4163   addptr(len, 4);
4164   jccb(Assembler::notZero, COMPARE_VECTORS);
4165 
4166   // Compare trailing char (final 2-3 bytes), if any
4167   bind(COMPARE_CHAR);
4168 
4169   testl(result, 0x2);   // tail  char
4170   jccb(Assembler::zero, COMPARE_BYTE);
4171   load_unsigned_short(tmp1, Address(ary1, 0));
4172   andl(tmp1, 0x00008080);
4173   jccb(Assembler::notZero, CHAR_ADJUST);
4174   lea(ary1, Address(ary1, 2));
4175 
4176   bind(COMPARE_BYTE);
4177   testl(result, 0x1);   // tail  byte
4178   jccb(Assembler::zero, DONE);
4179   load_unsigned_byte(tmp1, Address(ary1, 0));
4180   testl(tmp1, 0x00000080);
4181   jccb(Assembler::zero, DONE);
4182   subptr(result, 1);
4183   jmpb(DONE);
4184 
4185   bind(TAIL_ADJUST);
4186   // there are negative bits in the last 4 byte block.
4187   // Adjust result and check the next three bytes
4188   addptr(result, len);
4189   orl(result, 3);
4190   lea(ary1, Address(ary1, len, Address::times_1));
4191   jmpb(COMPARE_CHAR);
4192 
4193   bind(CHAR_ADJUST);
4194   // We are looking at a char + optional byte tail, and found that one
4195   // of the bytes in the char is negative. Adjust the result, check the
4196   // first byte and readjust if needed.
4197   andl(result, 0xfffffffc);
4198   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4199   jccb(Assembler::notZero, DONE);
4200   addptr(result, 1);
4201 
4202   // That's it
4203   bind(DONE);
4204   if (UseAVX >= 2) {
4205     // clean upper bits of YMM registers
4206     vpxor(vec1, vec1);
4207     vpxor(vec2, vec2);
4208   }
4209 }
4210 
4211 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
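     // Sets result to 1 if the contents are equal and to 0 otherwise. With expand_ary2 set,
     // ary1 holds 2-byte elements and ary2 holds 1-byte elements that are zero-extended
     // before comparison (expansion is only implemented for the AVX2 path below).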
4212 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4213                                       Register limit, Register result, Register chr,
4214                                       XMMRegister vec1, XMMRegister vec2, bool is_char,
4215                                       KRegister mask, bool expand_ary2) {
4216   // for expand_ary2, limit is the (smaller) size of the second array.
4217   ShortBranchVerifier sbv(this);
4218   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4219 
4220   assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4221          "Expansion only implemented for AVX2");
4222 
4223   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4224   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4225 
4226   Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4227   int scaleIncr = expand_ary2 ? 8 : 16;
4228 
4229   if (is_array_equ) {
4230     // Check the input args
4231     cmpoop(ary1, ary2);
4232     jcc(Assembler::equal, TRUE_LABEL);
4233 
4234     // Need additional checks for arrays_equals.
4235     testptr(ary1, ary1);
4236     jcc(Assembler::zero, FALSE_LABEL);
4237     testptr(ary2, ary2);
4238     jcc(Assembler::zero, FALSE_LABEL);
4239 
4240     // Check the lengths
4241     movl(limit, Address(ary1, length_offset));
4242     cmpl(limit, Address(ary2, length_offset));
4243     jcc(Assembler::notEqual, FALSE_LABEL);
4244   }
4245 
4246   // count == 0
4247   testl(limit, limit);
4248   jcc(Assembler::zero, TRUE_LABEL);
4249 
4250   if (is_array_equ) {
4251     // Load array address
4252     lea(ary1, Address(ary1, base_offset));
4253     lea(ary2, Address(ary2, base_offset));
4254   }
4255 
4256   if (is_array_equ && is_char) {
4257     // arrays_equals when used for char[].
4258     shll(limit, 1);      // convert char count to byte count (still != 0)
4259   }
4260   movl(result, limit); // copy
4261 
4262   if (UseAVX >= 2) {
4263     // With AVX2, use 32-byte vector compare
4264     Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4265 
4266     // Compare 32-byte vectors
4267     if (expand_ary2) {
4268       andl(result, 0x0000000f);  //   tail count (in bytes)
4269       andl(limit, 0xfffffff0);   // vector count (in bytes)
4270       jcc(Assembler::zero, COMPARE_TAIL);
4271     } else {
4272       andl(result, 0x0000001f);  //   tail count (in bytes)
4273       andl(limit, 0xffffffe0);   // vector count (in bytes)
4274       jcc(Assembler::zero, COMPARE_TAIL_16);
4275     }
4276 
4277     lea(ary1, Address(ary1, limit, scaleFactor));
4278     lea(ary2, Address(ary2, limit, Address::times_1));
4279     negptr(limit);
4280 
4281     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4282       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4283 
4284       cmpl(limit, -64);
4285       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4286 
4287       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4288 
4289       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4290       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4291       kortestql(mask, mask);
4292       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4293       addptr(limit, 64);  // update since we already compared at this addr
4294       cmpl(limit, -64);
4295       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4296 
4297       // At this point we may still need to compare -limit+result bytes.
4298       // We could execute the next two instructions and just continue via the non-wide path:
4299       //  cmpl(limit, 0);
4300       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4301       // But since we stopped at the points ary{1,2}+limit which are
4302       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4303       // (|limit| <= 32 and result < 32),
4304       // we may just compare the last 64 bytes.
4305       //
4306       addptr(result, -64);   // it is safe, because we just came from this area
4307       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4308       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4309       kortestql(mask, mask);
4310       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4311 
4312       jmp(TRUE_LABEL);
4313 
4314       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4315 
4316     } // if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw())
4317 
4318     bind(COMPARE_WIDE_VECTORS);
4319     vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4320     if (expand_ary2) {
4321       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4322     } else {
4323       vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4324     }
4325     vpxor(vec1, vec2);
4326 
4327     vptest(vec1, vec1);
4328     jcc(Assembler::notZero, FALSE_LABEL);
4329     addptr(limit, scaleIncr * 2);
4330     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4331 
4332     testl(result, result);
4333     jcc(Assembler::zero, TRUE_LABEL);
4334 
4335     vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4336     if (expand_ary2) {
4337       vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4338     } else {
4339       vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4340     }
4341     vpxor(vec1, vec2);
4342 
4343     vptest(vec1, vec1);
4344     jcc(Assembler::notZero, FALSE_LABEL);
4345     jmp(TRUE_LABEL);
4346 
4347     bind(COMPARE_TAIL_16); // limit is zero
4348     movl(limit, result);
4349 
4350     // Compare 16-byte chunks
4351     andl(result, 0x0000000f);  //   tail count (in bytes)
4352     andl(limit, 0xfffffff0);   // vector count (in bytes)
4353     jcc(Assembler::zero, COMPARE_TAIL);
4354 
4355     lea(ary1, Address(ary1, limit, scaleFactor));
4356     lea(ary2, Address(ary2, limit, Address::times_1));
4357     negptr(limit);
4358 
4359     bind(COMPARE_WIDE_VECTORS_16);
4360     movdqu(vec1, Address(ary1, limit, scaleFactor));
4361     if (expand_ary2) {
4362       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4363     } else {
4364       movdqu(vec2, Address(ary2, limit, Address::times_1));
4365     }
4366     pxor(vec1, vec2);
4367 
4368     ptest(vec1, vec1);
4369     jcc(Assembler::notZero, FALSE_LABEL);
4370     addptr(limit, scaleIncr);
4371     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4372 
4373     bind(COMPARE_TAIL); // limit is zero
4374     movl(limit, result);
4375     // Fallthru to tail compare
4376   } else if (UseSSE42Intrinsics) {
4377     // With SSE4.2, use double quad vector compare
4378     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4379 
4380     // Compare 16-byte vectors
4381     andl(result, 0x0000000f);  //   tail count (in bytes)
4382     andl(limit, 0xfffffff0);   // vector count (in bytes)
4383     jcc(Assembler::zero, COMPARE_TAIL);
4384 
4385     lea(ary1, Address(ary1, limit, Address::times_1));
4386     lea(ary2, Address(ary2, limit, Address::times_1));
4387     negptr(limit);
4388 
4389     bind(COMPARE_WIDE_VECTORS);
4390     movdqu(vec1, Address(ary1, limit, Address::times_1));
4391     movdqu(vec2, Address(ary2, limit, Address::times_1));
4392     pxor(vec1, vec2);
4393 
4394     ptest(vec1, vec1);
4395     jcc(Assembler::notZero, FALSE_LABEL);
4396     addptr(limit, 16);
4397     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4398 
4399     testl(result, result);
4400     jcc(Assembler::zero, TRUE_LABEL);
4401 
4402     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4403     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4404     pxor(vec1, vec2);
4405 
4406     ptest(vec1, vec1);
4407     jccb(Assembler::notZero, FALSE_LABEL);
4408     jmpb(TRUE_LABEL);
4409 
4410     bind(COMPARE_TAIL); // limit is zero
4411     movl(limit, result);
4412     // Fallthru to tail compare
4413   }
4414 
4415   // Compare 4-byte vectors
4416   if (expand_ary2) {
4417     testl(result, result);
4418     jccb(Assembler::zero, TRUE_LABEL);
4419   } else {
4420     andl(limit, 0xfffffffc); // vector count (in bytes)
4421     jccb(Assembler::zero, COMPARE_CHAR);
4422   }
4423 
4424   lea(ary1, Address(ary1, limit, scaleFactor));
4425   lea(ary2, Address(ary2, limit, Address::times_1));
4426   negptr(limit);
4427 
4428   bind(COMPARE_VECTORS);
4429   if (expand_ary2) {
4430     // There are no "vector" operations for bytes to shorts
4431     movzbl(chr, Address(ary2, limit, Address::times_1));
4432     cmpw(Address(ary1, limit, Address::times_2), chr);
4433     jccb(Assembler::notEqual, FALSE_LABEL);
4434     addptr(limit, 1);
4435     jcc(Assembler::notZero, COMPARE_VECTORS);
4436     jmp(TRUE_LABEL);
4437   } else {
4438     movl(chr, Address(ary1, limit, Address::times_1));
4439     cmpl(chr, Address(ary2, limit, Address::times_1));
4440     jccb(Assembler::notEqual, FALSE_LABEL);
4441     addptr(limit, 4);
4442     jcc(Assembler::notZero, COMPARE_VECTORS);
4443   }
4444 
4445   // Compare trailing char (final 2 bytes), if any
4446   bind(COMPARE_CHAR);
4447   testl(result, 0x2);   // tail  char
4448   jccb(Assembler::zero, COMPARE_BYTE);
4449   load_unsigned_short(chr, Address(ary1, 0));
4450   load_unsigned_short(limit, Address(ary2, 0));
4451   cmpl(chr, limit);
4452   jccb(Assembler::notEqual, FALSE_LABEL);
4453 
4454   if (is_array_equ && is_char) {
4455     bind(COMPARE_BYTE);
4456   } else {
4457     lea(ary1, Address(ary1, 2));
4458     lea(ary2, Address(ary2, 2));
4459 
4460     bind(COMPARE_BYTE);
4461     testl(result, 0x1);   // tail  byte
4462     jccb(Assembler::zero, TRUE_LABEL);
4463     load_unsigned_byte(chr, Address(ary1, 0));
4464     load_unsigned_byte(limit, Address(ary2, 0));
4465     cmpl(chr, limit);
4466     jccb(Assembler::notEqual, FALSE_LABEL);
4467   }
4468   bind(TRUE_LABEL);
4469   movl(result, 1);   // return true
4470   jmpb(DONE);
4471 
4472   bind(FALSE_LABEL);
4473   xorl(result, result); // return false
4474 
4475   // That's it
4476   bind(DONE);
4477   if (UseAVX >= 2) {
4478     // clean upper bits of YMM registers
4479     vpxor(vec1, vec1);
4480     vpxor(vec2, vec2);
4481   }
4482 }
4483 
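     // Out-of-line slow path used by convertF2I below: the source XMM register is spilled to
     // the stack, the matching StubRoutines fixup stub is called to compute the correct result
     // for NaN/out-of-range inputs, and the fixed-up value is popped back into dst.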
4484 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4485 #define __ masm.
4486   Register dst = stub.data<0>();
4487   XMMRegister src = stub.data<1>();
4488   address target = stub.data<2>();
4489   __ bind(stub.entry());
4490   __ subptr(rsp, 8);
4491   __ movdbl(Address(rsp), src);
4492   __ call(RuntimeAddress(target));
4493   // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
4494   __ pop(dst);
4495   __ jmp(stub.continuation());
4496 #undef __
4497 }
4498 
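     // Convert float/double to int/long. The cvttss2si/cvttsd2si family returns the "integer
     // indefinite" value (0x80000000 or 0x8000000000000000) for NaN and out-of-range inputs,
     // so comparing the result against that value detects when the fixup stub must run.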
4499 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4500   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4501   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4502 
4503   address slowpath_target;
4504   if (dst_bt == T_INT) {
4505     if (src_bt == T_FLOAT) {
4506       cvttss2sil(dst, src);
4507       cmpl(dst, 0x80000000);
4508       slowpath_target = StubRoutines::x86::f2i_fixup();
4509     } else {
4510       cvttsd2sil(dst, src);
4511       cmpl(dst, 0x80000000);
4512       slowpath_target = StubRoutines::x86::d2i_fixup();
4513     }
4514   } else {
4515     if (src_bt == T_FLOAT) {
4516       cvttss2siq(dst, src);
4517       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4518       slowpath_target = StubRoutines::x86::f2l_fixup();
4519     } else {
4520       cvttsd2siq(dst, src);
4521       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4522       slowpath_target = StubRoutines::x86::d2l_fixup();
4523     }
4524   }
4525 
4526   // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
4527   int max_size = 23 + (UseAPX ? 1 : 0);
4528   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
4529   jcc(Assembler::equal, stub->entry());
4530   bind(stub->continuation());
4531 }
4532 
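     // Masked vector shift/rotate with an immediate count: dispatches the ideal opcode to the
     // corresponding EVEX masked instruction.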
4533 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4534                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4535   switch(ideal_opc) {
4536     case Op_LShiftVS:
4537       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4538     case Op_LShiftVI:
4539       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4540     case Op_LShiftVL:
4541       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4542     case Op_RShiftVS:
4543       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4544     case Op_RShiftVI:
4545       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4546     case Op_RShiftVL:
4547       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4548     case Op_URShiftVS:
4549       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4550     case Op_URShiftVI:
4551       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4552     case Op_URShiftVL:
4553       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4554     case Op_RotateRightV:
4555       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4556     case Op_RotateLeftV:
4557       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4558     default:
4559       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4560       break;
4561   }
4562 }
4563 
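     // Masked saturating vector add/sub for byte and short lanes, dispatching to the signed or
     // unsigned EVEX forms below.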
4564 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4565                                                XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4566   if (is_unsigned) {
4567     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4568   } else {
4569     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4570   }
4571 }
4572 
4573 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4574                                                       XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4575   switch (elem_bt) {
4576     case T_BYTE:
4577       if (ideal_opc == Op_SaturatingAddV) {
4578         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4579       } else {
4580         assert(ideal_opc == Op_SaturatingSubV, "");
4581         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4582       }
4583       break;
4584     case T_SHORT:
4585       if (ideal_opc == Op_SaturatingAddV) {
4586         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4587       } else {
4588         assert(ideal_opc == Op_SaturatingSubV, "");
4589         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4590       }
4591       break;
4592     default:
4593       fatal("Unsupported type %s", type2name(elem_bt));
4594       break;
4595   }
4596 }
4597 
4598 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4599                                                         XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4600   switch (elem_bt) {
4601     case T_BYTE:
4602       if (ideal_opc == Op_SaturatingAddV) {
4603         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4604       } else {
4605         assert(ideal_opc == Op_SaturatingSubV, "");
4606         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4607       }
4608       break;
4609     case T_SHORT:
4610       if (ideal_opc == Op_SaturatingAddV) {
4611         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4612       } else {
4613         assert(ideal_opc == Op_SaturatingSubV, "");
4614         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4615       }
4616       break;
4617     default:
4618       fatal("Unsupported type %s", type2name(elem_bt));
4619       break;
4620   }
4621 }
4622 
4623 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4624                                                Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4625   if (is_unsigned) {
4626     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4627   } else {
4628     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4629   }
4630 }
4631 
4632 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4633                                                       XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4634   switch (elem_bt) {
4635     case T_BYTE:
4636       if (ideal_opc == Op_SaturatingAddV) {
4637         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4638       } else {
4639         assert(ideal_opc == Op_SaturatingSubV, "");
4640         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4641       }
4642       break;
4643     case T_SHORT:
4644       if (ideal_opc == Op_SaturatingAddV) {
4645         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4646       } else {
4647         assert(ideal_opc == Op_SaturatingSubV, "");
4648         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4649       }
4650       break;
4651     default:
4652       fatal("Unsupported type %s", type2name(elem_bt));
4653       break;
4654   }
4655 }
4656 
4657 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4658                                                         XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4659   switch (elem_bt) {
4660     case T_BYTE:
4661       if (ideal_opc == Op_SaturatingAddV) {
4662         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4663       } else {
4664         assert(ideal_opc == Op_SaturatingSubV, "");
4665         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4666       }
4667       break;
4668     case T_SHORT:
4669       if (ideal_opc == Op_SaturatingAddV) {
4670         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4671       } else {
4672         assert(ideal_opc == Op_SaturatingSubV, "");
4673         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4674       }
4675       break;
4676     default:
4677       fatal("Unsupported type %s", type2name(elem_bt));
4678       break;
4679   }
4680 }
4681 
4682 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4683                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4684                                     bool is_varshift) {
4685   switch (ideal_opc) {
4686     case Op_AddVB:
4687       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4688     case Op_AddVS:
4689       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4690     case Op_AddVI:
4691       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4692     case Op_AddVL:
4693       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4694     case Op_AddVF:
4695       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4696     case Op_AddVD:
4697       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4698     case Op_SubVB:
4699       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4700     case Op_SubVS:
4701       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4702     case Op_SubVI:
4703       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4704     case Op_SubVL:
4705       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4706     case Op_SubVF:
4707       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4708     case Op_SubVD:
4709       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4710     case Op_MulVS:
4711       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4712     case Op_MulVI:
4713       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4714     case Op_MulVL:
4715       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4716     case Op_MulVF:
4717       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4718     case Op_MulVD:
4719       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4720     case Op_DivVF:
4721       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4722     case Op_DivVD:
4723       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4724     case Op_SqrtVF:
4725       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4726     case Op_SqrtVD:
4727       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4728     case Op_AbsVB:
4729       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4730     case Op_AbsVS:
4731       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4732     case Op_AbsVI:
4733       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4734     case Op_AbsVL:
4735       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4736     case Op_FmaVF:
4737       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4738     case Op_FmaVD:
4739       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4740     case Op_VectorRearrange:
4741       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4742     case Op_LShiftVS:
4743       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4744     case Op_LShiftVI:
4745       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4746     case Op_LShiftVL:
4747       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4748     case Op_RShiftVS:
4749       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4750     case Op_RShiftVI:
4751       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4752     case Op_RShiftVL:
4753       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4754     case Op_URShiftVS:
4755       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4756     case Op_URShiftVI:
4757       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4758     case Op_URShiftVL:
4759       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4760     case Op_RotateLeftV:
4761       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4762     case Op_RotateRightV:
4763       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4764     case Op_MaxV:
4765       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4766     case Op_MinV:
4767       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4768     case Op_UMinV:
4769       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4770     case Op_UMaxV:
4771       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4772     case Op_XorV:
4773       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4774     case Op_OrV:
4775       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4776     case Op_AndV:
4777       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4778     default:
4779       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4780       break;
4781   }
4782 }
4783 
4784 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4785                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4786   switch (ideal_opc) {
4787     case Op_AddVB:
4788       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4789     case Op_AddVS:
4790       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4791     case Op_AddVI:
4792       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4793     case Op_AddVL:
4794       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4795     case Op_AddVF:
4796       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4797     case Op_AddVD:
4798       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4799     case Op_SubVB:
4800       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4801     case Op_SubVS:
4802       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4803     case Op_SubVI:
4804       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4805     case Op_SubVL:
4806       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4807     case Op_SubVF:
4808       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4809     case Op_SubVD:
4810       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4811     case Op_MulVS:
4812       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4813     case Op_MulVI:
4814       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4815     case Op_MulVL:
4816       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4817     case Op_MulVF:
4818       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4819     case Op_MulVD:
4820       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4821     case Op_DivVF:
4822       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4823     case Op_DivVD:
4824       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4825     case Op_FmaVF:
4826       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4827     case Op_FmaVD:
4828       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4829     case Op_MaxV:
4830       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4831     case Op_MinV:
4832       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4833     case Op_UMaxV:
4834       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4835     case Op_UMinV:
4836       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4837     case Op_XorV:
4838       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4839     case Op_OrV:
4840       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4841     case Op_AndV:
4842       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4843     default:
4844       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4845       break;
4846   }
4847 }
4848 
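     // Bitwise AND/OR/XOR of opmask registers. mask_len selects the k-instruction width:
     // the byte form covers up to 8 mask bits, then the word/dword/qword forms are used.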
4849 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4850                                   KRegister src1, KRegister src2) {
4851   BasicType etype = T_ILLEGAL;
4852   switch(mask_len) {
4853     case 2:
4854     case 4:
4855     case 8:  etype = T_BYTE; break;
4856     case 16: etype = T_SHORT; break;
4857     case 32: etype = T_INT; break;
4858     case 64: etype = T_LONG; break;
4859     default: fatal("Unsupported type"); break;
4860   }
4861   assert(etype != T_ILLEGAL, "");
4862   switch(ideal_opc) {
4863     case Op_AndVMask:
4864       kand(etype, dst, src1, src2); break;
4865     case Op_OrVMask:
4866       kor(etype, dst, src1, src2); break;
4867     case Op_XorVMask:
4868       kxor(etype, dst, src1, src2); break;
4869     default:
4870       fatal("Unsupported masked operation"); break;
4871   }
4872 }
4873 
4874 /*
4875  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4876  * If src is NaN, the result is 0.
4877  * If src is negative infinity or any value less than or equal to Integer.MIN_VALUE,
4878  * the result is Integer.MIN_VALUE.
4879  * If src is positive infinity or any value greater than or equal to Integer.MAX_VALUE,
4880  * the result is Integer.MAX_VALUE.
4881  */
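     /*
      * Illustrative lane values for a four-lane F2I cast (not generated code):
      *   src               = { 1.5f, NaN,        +Inf,       -Inf       }
      *   after vcvttps2dq  = { 1,    0x80000000, 0x80000000, 0x80000000 }
      *   after this fixup  = { 1,    0,          0x7fffffff, 0x80000000 }
      */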
4882 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4883                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4884                                                                    Register rscratch, AddressLiteral float_sign_flip,
4885                                                                    int vec_enc) {
4886   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4887   Label done;
4888   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4889   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4890   vptest(xtmp2, xtmp2, vec_enc);
4891   jccb(Assembler::equal, done);
4892 
4893   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4894   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4895 
4896   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4897   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4898   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4899 
4900   // Recompute the mask for the remaining special values.
4901   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4902   // Extract SRC values corresponding to TRUE mask lanes.
4903   vpand(xtmp4, xtmp2, src, vec_enc);
4904   // Flip mask bits so that the MSB of the MASK lanes corresponding to +ve special
4905   // values is set.
4906   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4907 
4908   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4909   bind(done);
4910 }
4911 
4912 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4913                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4914                                                                     Register rscratch, AddressLiteral float_sign_flip,
4915                                                                     int vec_enc) {
4916   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4917   Label done;
4918   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4919   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4920   kortestwl(ktmp1, ktmp1);
4921   jccb(Assembler::equal, done);
4922 
4923   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4924   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4925   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4926 
4927   kxorwl(ktmp1, ktmp1, ktmp2);
4928   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4929   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4930   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4931   bind(done);
4932 }
4933 
4934 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4935                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4936                                                                      Register rscratch, AddressLiteral double_sign_flip,
4937                                                                      int vec_enc) {
4938   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4939 
4940   Label done;
4941   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4942   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4943   kortestwl(ktmp1, ktmp1);
4944   jccb(Assembler::equal, done);
4945 
4946   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4947   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4948   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4949 
4950   kxorwl(ktmp1, ktmp1, ktmp2);
4951   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4952   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4953   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4954   bind(done);
4955 }
4956 
4957 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4958                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4959                                                                      Register rscratch, AddressLiteral float_sign_flip,
4960                                                                      int vec_enc) {
4961   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4962   Label done;
4963   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4964   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4965   kortestwl(ktmp1, ktmp1);
4966   jccb(Assembler::equal, done);
4967 
4968   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4969   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4970   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4971 
4972   kxorwl(ktmp1, ktmp1, ktmp2);
4973   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4974   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4975   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4976   bind(done);
4977 }
4978 
4979 /*
4980  * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
4981  * If src is NaN, the result is 0.
4982  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4983  * the result is equal to the value of Long.MIN_VALUE.
4984  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4985  * the result is equal to the value of Long.MAX_VALUE.
4986  */
4987 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4988                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4989                                                                       Register rscratch, AddressLiteral double_sign_flip,
4990                                                                       int vec_enc) {
4991   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4992 
4993   Label done;
4994   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4995   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4996   kortestwl(ktmp1, ktmp1);
4997   jccb(Assembler::equal, done);
4998 
4999   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5000   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5001   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5002 
5003   kxorwl(ktmp1, ktmp1, ktmp2);
5004   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5005   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5006   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5007   bind(done);
5008 }
5009 
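     // Packs the doublewords selected by the vshufps immediate 'index' into dst. For 256-bit
     // inputs the high 128-bit half is first extracted into xtmp so the selection can be done
     // with an in-lane shuffle; 128-bit inputs use the zero vector as the second shuffle operand.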
5010 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5011                                                              XMMRegister xtmp, int index, int vec_enc) {
5012   assert(vec_enc < Assembler::AVX_512bit, "");
5013   if (vec_enc == Assembler::AVX_256bit) {
5014     vextractf128_high(xtmp, src);
5015     vshufps(dst, src, xtmp, index, vec_enc);
5016   } else {
5017     vshufps(dst, src, zero, index, vec_enc);
5018   }
5019 }
5020 
5021 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5022                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5023                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
5024   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5025 
5026   Label done;
5027   // Compare the destination lanes with float_sign_flip
5028   // value to get mask for all special values.
5029   movdqu(xtmp1, float_sign_flip, rscratch);
5030   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5031   ptest(xtmp2, xtmp2);
5032   jccb(Assembler::equal, done);
5033 
5034   // Flip float_sign_flip to get max integer value.
5035   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5036   pxor(xtmp1, xtmp4);
5037 
5038   // Set destination lanes corresponding to unordered source lanes to zero.
5039   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5040   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5041 
5042   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5043   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5044   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5045 
5046   // Recompute the mask for the remaining special values.
5047   pxor(xtmp2, xtmp3);
5048   // Extract mask corresponding to non-negative source lanes.
5049   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5050 
5051   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5052   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5053   pand(xtmp3, xtmp2);
5054 
5055   // Replace destination lanes holding the special value (0x80000000) with max int
5056   // if the corresponding source lane holds a +ve value.
5057   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5058   bind(done);
5059 }
5060 
5061 
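     // Narrows int lanes to short or byte lanes: the upper bits of each int lane are masked
     // off and the lanes are packed against a zero vector; for 256-bit vectors the cross-lane
     // doubleword pack fixes up the per-128-bit-lane behavior of vpackusdw.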
5062 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5063                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5064   switch(to_elem_bt) {
5065     case T_SHORT:
5066       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5067       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5068       vpackusdw(dst, dst, zero, vec_enc);
5069       if (vec_enc == Assembler::AVX_256bit) {
5070         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5071       }
5072       break;
5073     case  T_BYTE:
5074       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5075       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5076       vpackusdw(dst, dst, zero, vec_enc);
5077       if (vec_enc == Assembler::AVX_256bit) {
5078         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5079       }
5080       vpackuswb(dst, dst, zero, vec_enc);
5081       break;
5082     default: assert(false, "%s", type2name(to_elem_bt));
5083   }
5084 }
5085 
5086 /*
5087  * Algorithm for vector D2L and F2I conversions:
5088  * a) Perform the vector D2L/F2I cast.
5089  * b) Take the fast path if no result vector lane contains the value 0x80000000.
5090  *    Such a lane signifies that the source value could be one of the special floating
5091  *    point values (NaN, -Inf, Inf, Max, Min).
5092  * c) Set the destination to zero if the source is NaN.
5093  * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
5094  */
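     // For example (illustrative values), an F2I cast of { 2.0f, -1.0f, 3.5f, 100.0f } leaves
     // no 0x80000000 lane after step (a), so steps (c) and (d) are skipped entirely.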
5095 
5096 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5097                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5098                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5099   int to_elem_sz = type2aelembytes(to_elem_bt);
5100   assert(to_elem_sz <= 4, "");
5101   vcvttps2dq(dst, src, vec_enc);
5102   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5103   if (to_elem_sz < 4) {
5104     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5105     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5106   }
5107 }
5108 
5109 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5110                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5111                                             Register rscratch, int vec_enc) {
5112   int to_elem_sz = type2aelembytes(to_elem_bt);
5113   assert(to_elem_sz <= 4, "");
5114   vcvttps2dq(dst, src, vec_enc);
5115   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5116   switch(to_elem_bt) {
5117     case T_INT:
5118       break;
5119     case T_SHORT:
5120       evpmovdw(dst, dst, vec_enc);
5121       break;
5122     case T_BYTE:
5123       evpmovdb(dst, dst, vec_enc);
5124       break;
5125     default: assert(false, "%s", type2name(to_elem_bt));
5126   }
5127 }
5128 
5129 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5130                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5131                                             Register rscratch, int vec_enc) {
5132   evcvttps2qq(dst, src, vec_enc);
5133   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5134 }
5135 
5136 // Handling for downcasting from double to integer or sub-word types on AVX2.
5137 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5138                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5139                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5140   int to_elem_sz = type2aelembytes(to_elem_bt);
5141   assert(to_elem_sz < 8, "");
5142   vcvttpd2dq(dst, src, vec_enc);
5143   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5144                                               float_sign_flip, vec_enc);
5145   if (to_elem_sz < 4) {
5146     // xtmp4 holds all zero lanes.
5147     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5148   }
5149 }
5150 
5151 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5152                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5153                                             KRegister ktmp2, AddressLiteral sign_flip,
5154                                             Register rscratch, int vec_enc) {
5155   if (VM_Version::supports_avx512dq()) {
5156     evcvttpd2qq(dst, src, vec_enc);
5157     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5158     switch(to_elem_bt) {
5159       case T_LONG:
5160         break;
5161       case T_INT:
5162         evpmovsqd(dst, dst, vec_enc);
5163         break;
5164       case T_SHORT:
5165         evpmovsqd(dst, dst, vec_enc);
5166         evpmovdw(dst, dst, vec_enc);
5167         break;
5168       case T_BYTE:
5169         evpmovsqd(dst, dst, vec_enc);
5170         evpmovdb(dst, dst, vec_enc);
5171         break;
5172       default: assert(false, "%s", type2name(to_elem_bt));
5173     }
5174   } else {
5175     assert(type2aelembytes(to_elem_bt) <= 4, "");
5176     vcvttpd2dq(dst, src, vec_enc);
5177     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5178     switch(to_elem_bt) {
5179       case T_INT:
5180         break;
5181       case T_SHORT:
5182         evpmovdw(dst, dst, vec_enc);
5183         break;
5184       case T_BYTE:
5185         evpmovdb(dst, dst, vec_enc);
5186         break;
5187       default: assert(false, "%s", type2name(to_elem_bt));
5188     }
5189   }
5190 }
5191 
5192 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5193                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5194                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5195   // Perform the floor(val + 0.5) operation with MXCSR.RC set to round towards -inf,
5196   // then restore the original MXCSR.RC mode afterwards.
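       // E.g. floor(2.5 + 0.5) = 3 and floor(-2.5 + 0.5) = -2, i.e. ties round up.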
5197   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5198 
5199   mov64(tmp, julong_cast(0.5L));
5200   evpbroadcastq(xtmp1, tmp, vec_enc);
5201   vaddpd(xtmp1, src , xtmp1, vec_enc);
5202   evcvtpd2qq(dst, xtmp1, vec_enc);
5203   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5204                                                 double_sign_flip, vec_enc);
5205 
5206   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5207 }
5208 
5209 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5210                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5211                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5212   // Perform the floor(val + 0.5) operation with MXCSR.RC set to round towards -inf,
5213   // then restore the original MXCSR.RC mode afterwards.
5214   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5215 
5216   movl(tmp, jint_cast(0.5));
5217   movq(xtmp1, tmp);
5218   vbroadcastss(xtmp1, xtmp1, vec_enc);
5219   vaddps(xtmp1, src , xtmp1, vec_enc);
5220   vcvtps2dq(dst, xtmp1, vec_enc);
5221   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5222                                               float_sign_flip, vec_enc);
5223 
5224   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5225 }
5226 
5227 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5228                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5229                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
5230   // Perform the floor(val + 0.5) operation with MXCSR.RC set to round towards -inf,
5231   // then restore the original MXCSR.RC mode afterwards.
5232   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5233 
5234   movl(tmp, jint_cast(0.5));
5235   movq(xtmp1, tmp);
5236   vbroadcastss(xtmp1, xtmp1, vec_enc);
5237   vaddps(xtmp1, src , xtmp1, vec_enc);
5238   vcvtps2dq(dst, xtmp1, vec_enc);
5239   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5240 
5241   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5242 }
5243 
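     // Zero-extending lane-widening casts (byte->short/int/long, short->int/long, int->long)
     // using the vpmovzx* family.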
5244 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5245                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5246   switch (from_elem_bt) {
5247     case T_BYTE:
5248       switch (to_elem_bt) {
5249         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5250         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5251         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5252         default: ShouldNotReachHere();
5253       }
5254       break;
5255     case T_SHORT:
5256       switch (to_elem_bt) {
5257         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5258         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5259         default: ShouldNotReachHere();
5260       }
5261       break;
5262     case T_INT:
5263       assert(to_elem_bt == T_LONG, "");
5264       vpmovzxdq(dst, src, vlen_enc);
5265       break;
5266     default:
5267       ShouldNotReachHere();
5268   }
5269 }
5270 
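     // Sign-extending lane-widening casts using the vpmovsx* family.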
5271 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5272                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5273   switch (from_elem_bt) {
5274     case T_BYTE:
5275       switch (to_elem_bt) {
5276         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5277         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5278         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5279         default: ShouldNotReachHere();
5280       }
5281       break;
5282     case T_SHORT:
5283       switch (to_elem_bt) {
5284         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5285         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5286         default: ShouldNotReachHere();
5287       }
5288       break;
5289     case T_INT:
5290       assert(to_elem_bt == T_LONG, "");
5291       vpmovsxdq(dst, src, vlen_enc);
5292       break;
5293     default:
5294       ShouldNotReachHere();
5295   }
5296 }
5297 
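     // Converts a vector mask (lanes holding 0 or -1) between element sizes on AVX/AVX2:
     // widening sign-extends the lanes, narrowing packs them with signed saturation and uses
     // vpermq/vpshufd shuffles to undo the per-128-bit-lane packing of 256-bit vectors.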
5298 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5299                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5300   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5301   assert(vlen_enc != AVX_512bit, "");
5302 
5303   int dst_bt_size = type2aelembytes(dst_bt);
5304   int src_bt_size = type2aelembytes(src_bt);
5305   if (dst_bt_size > src_bt_size) {
5306     switch (dst_bt_size / src_bt_size) {
5307       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5308       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5309       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5310       default: ShouldNotReachHere();
5311     }
5312   } else {
5313     assert(dst_bt_size < src_bt_size, "");
5314     switch (src_bt_size / dst_bt_size) {
5315       case 2: {
5316         if (vlen_enc == AVX_128bit) {
5317           vpacksswb(dst, src, src, vlen_enc);
5318         } else {
5319           vpacksswb(dst, src, src, vlen_enc);
5320           vpermq(dst, dst, 0x08, vlen_enc);
5321         }
5322         break;
5323       }
5324       case 4: {
5325         if (vlen_enc == AVX_128bit) {
5326           vpackssdw(dst, src, src, vlen_enc);
5327           vpacksswb(dst, dst, dst, vlen_enc);
5328         } else {
5329           vpackssdw(dst, src, src, vlen_enc);
5330           vpermq(dst, dst, 0x08, vlen_enc);
5331           vpacksswb(dst, dst, dst, AVX_128bit);
5332         }
5333         break;
5334       }
5335       case 8: {
5336         if (vlen_enc == AVX_128bit) {
5337           vpshufd(dst, src, 0x08, vlen_enc);
5338           vpackssdw(dst, dst, dst, vlen_enc);
5339           vpacksswb(dst, dst, dst, vlen_enc);
5340         } else {
5341           vpshufd(dst, src, 0x08, vlen_enc);
5342           vpermq(dst, dst, 0x08, vlen_enc);
5343           vpackssdw(dst, dst, dst, AVX_128bit);
5344           vpacksswb(dst, dst, dst, AVX_128bit);
5345         }
5346         break;
5347       }
5348       default: ShouldNotReachHere();
5349     }
5350   }
5351 }
5352 
5353 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5354                                    bool merge, BasicType bt, int vlen_enc) {
5355   if (bt == T_INT) {
5356     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5357   } else {
5358     assert(bt == T_LONG, "");
5359     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5360   }
5361 }
5362 
5363 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5364                                    bool merge, BasicType bt, int vlen_enc) {
5365   if (bt == T_INT) {
5366     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5367   } else {
5368     assert(bt == T_LONG, "");
5369     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5370   }
5371 }
5372 
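     // Expands the mask bits held in src (a GPR) into a byte vector with one 0x00/0x01 byte
     // per mask bit, 8 bits at a time: pdepq against 0x0101010101010101 deposits each mask bit
     // into the low bit of its own byte.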
5373 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5374                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5375                                                int vec_enc) {
5376   int index = 0;
5377   int vindex = 0;
5378   mov64(rtmp1, 0x0101010101010101L);
5379   pdepq(rtmp1, src, rtmp1);
5380   if (mask_len > 8) {
5381     movq(rtmp2, src);
5382     vpxor(xtmp, xtmp, xtmp, vec_enc);
5383     movq(xtmp, rtmp1);
5384   }
5385   movq(dst, rtmp1);
5386 
5387   mask_len -= 8;
5388   while (mask_len > 0) {
5389     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5390     index++;
5391     if ((index % 2) == 0) {
5392       pxor(xtmp, xtmp);
5393     }
5394     mov64(rtmp1, 0x0101010101010101L);
5395     shrq(rtmp2, 8);
5396     pdepq(rtmp1, rtmp2, rtmp1);
5397     pinsrq(xtmp, rtmp1, index % 2);
5398     vindex = index / 2;
5399     if (vindex) {
5400       // Write the entire 16 byte vector when both 64 bit
5401       // lanes are updated, to save redundant instructions.
5402       if (index % 2) {
5403         vinsertf128(dst, dst, xtmp, vindex);
5404       }
5405     } else {
5406       vmovdqu(dst, xtmp);
5407     }
5408     mask_len -= 8;
5409   }
5410 }
5411 
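     // tmp holds the mask bits as an integer. Computes the requested reduction: popcnt for
     // trueCount, bit scans for firstTrue/lastTrue (yielding masklen resp. -1 when no bit is
     // set), while toLong simply returns the bits already in dst (which must alias tmp).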
5412 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5413   switch(opc) {
5414     case Op_VectorMaskTrueCount:
5415       popcntq(dst, tmp);
5416       break;
5417     case Op_VectorMaskLastTrue:
5418       if (VM_Version::supports_lzcnt()) {
5419         lzcntq(tmp, tmp);
5420         movl(dst, 63);
5421         subl(dst, tmp);
5422       } else {
5423         movl(dst, -1);
5424         bsrq(tmp, tmp);
5425         cmov32(Assembler::notZero, dst, tmp);
5426       }
5427       break;
5428     case Op_VectorMaskFirstTrue:
5429       if (VM_Version::supports_bmi1()) {
5430         if (masklen < 32) {
5431           orl(tmp, 1 << masklen);
5432           tzcntl(dst, tmp);
5433         } else if (masklen == 32) {
5434           tzcntl(dst, tmp);
5435         } else {
5436           assert(masklen == 64, "");
5437           tzcntq(dst, tmp);
5438         }
5439       } else {
5440         if (masklen < 32) {
5441           orl(tmp, 1 << masklen);
5442           bsfl(dst, tmp);
5443         } else {
5444           assert(masklen == 32 || masklen == 64, "");
5445           movl(dst, masklen);
5446           if (masklen == 32)  {
5447             bsfl(tmp, tmp);
5448           } else {
5449             bsfq(tmp, tmp);
5450           }
5451           cmov32(Assembler::notZero, dst, tmp);
5452         }
5453       }
5454       break;
5455     case Op_VectorMaskToLong:
5456       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5457       break;
5458     default: assert(false, "Unhandled mask operation");
5459   }
5460 }
5461 
5462 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5463                                               int masklen, int masksize, int vec_enc) {
5464   assert(VM_Version::supports_popcnt(), "");
5465 
5466   if (VM_Version::supports_avx512bw()) {
5467     kmovql(tmp, mask);
5468   } else {
5469     assert(masklen <= 16, "");
5470     kmovwl(tmp, mask);
5471   }
5472 
5473   // A mask generated by partial vector comparison/replicate/mask manipulation
5474   // operations needs to be clipped.
5475   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5476     andq(tmp, (1 << masklen) - 1);
5477   }
5478 
5479   vector_mask_operation_helper(opc, dst, tmp, masklen);
5480 }
5481 
5482 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5483                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5484   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5485          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5486   assert(VM_Version::supports_popcnt(), "");
5487 
5488   bool need_clip = false;
5489   switch(bt) {
5490     case T_BOOLEAN:
5491       // While masks of other types contain 0 or -1 per lane, boolean masks contain lane values of 0 or 1
5492       vpxor(xtmp, xtmp, xtmp, vec_enc);
5493       vpsubb(xtmp, xtmp, mask, vec_enc);
5494       vpmovmskb(tmp, xtmp, vec_enc);
5495       need_clip = masklen < 16;
5496       break;
5497     case T_BYTE:
5498       vpmovmskb(tmp, mask, vec_enc);
5499       need_clip = masklen < 16;
5500       break;
5501     case T_SHORT:
5502       vpacksswb(xtmp, mask, mask, vec_enc);
5503       if (masklen >= 16) {
5504         vpermpd(xtmp, xtmp, 8, vec_enc);
5505       }
5506       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5507       need_clip = masklen < 16;
5508       break;
5509     case T_INT:
5510     case T_FLOAT:
5511       vmovmskps(tmp, mask, vec_enc);
5512       need_clip = masklen < 4;
5513       break;
5514     case T_LONG:
5515     case T_DOUBLE:
5516       vmovmskpd(tmp, mask, vec_enc);
5517       need_clip = masklen < 2;
5518       break;
5519     default: assert(false, "Unhandled type, %s", type2name(bt));
5520   }
5521 
5522   // Mask generated out of partial vector comparisons/replicate/mask manipulation
5523   // operations needs to be clipped.
5524   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5525     // need_clip implies masklen < 32
5526     andq(tmp, (1 << masklen) - 1);
5527   }
5528 
5529   vector_mask_operation_helper(opc, dst, tmp, masklen);
5530 }
5531 
5532 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5533                                              Register rtmp2, int mask_len) {
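  // Computes the compressed mask: PEXT with an all-ones source and the (length-clipped)
  // lane mask as selector yields a value with popcount(mask) low-order bits set, i.e.
  // one set bit per active lane, packed at the low end of the result.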
5534   kmov(rtmp1, src);
5535   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5536   mov64(rtmp2, -1L);
5537   pextq(rtmp2, rtmp2, rtmp1);
5538   kmov(dst, rtmp2);
5539 }
5540 
5541 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5542                                                     XMMRegister mask, Register rtmp, Register rscratch,
5543                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5544                                                     int vec_enc) {
5545   assert(type2aelembytes(bt) >= 4, "");
5546   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5547   address compress_perm_table = nullptr;
5548   address expand_perm_table = nullptr;
5549   if (type2aelembytes(bt) == 8) {
5550     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5551     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5552     vmovmskpd(rtmp, mask, vec_enc);
5553   } else {
5554     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5555     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5556     vmovmskps(rtmp, mask, vec_enc);
5557   }
5558   shlq(rtmp, 5); // for 32 byte permute row.
5559   if (opcode == Op_CompressV) {
5560     lea(rscratch, ExternalAddress(compress_perm_table));
5561   } else {
5562     lea(rscratch, ExternalAddress(expand_perm_table));
5563   }
5564   addptr(rtmp, rscratch);
5565   vmovdqu(permv, Address(rtmp));
5566   vpermps(dst, permv, src, Assembler::AVX_256bit);
5567   vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with the zero vector using the permute mask: each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, so the row can also serve as a blending mask after
  // compressing/expanding the source vector lanes.
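  // Illustrative example (assuming the row layout described above): for an 8-lane int
  // compress with mask 0b00000101, the permute row would be [0, 2, -1, -1, -1, -1, -1, -1];
  // vpermps gathers src[0] and src[2] into the two low lanes, and the blend below zeroes
  // the remaining lanes whose permute entries are -1.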
5572   vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
5573 }
5574 
5575 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5576                                                bool merge, BasicType bt, int vec_enc) {
5577   if (opcode == Op_CompressV) {
5578     switch(bt) {
5579     case T_BYTE:
5580       evpcompressb(dst, mask, src, merge, vec_enc);
5581       break;
5582     case T_CHAR:
5583     case T_SHORT:
5584       evpcompressw(dst, mask, src, merge, vec_enc);
5585       break;
5586     case T_INT:
5587       evpcompressd(dst, mask, src, merge, vec_enc);
5588       break;
5589     case T_FLOAT:
5590       evcompressps(dst, mask, src, merge, vec_enc);
5591       break;
5592     case T_LONG:
5593       evpcompressq(dst, mask, src, merge, vec_enc);
5594       break;
5595     case T_DOUBLE:
5596       evcompresspd(dst, mask, src, merge, vec_enc);
5597       break;
5598     default:
5599       fatal("Unsupported type %s", type2name(bt));
5600       break;
5601     }
5602   } else {
5603     assert(opcode == Op_ExpandV, "");
5604     switch(bt) {
5605     case T_BYTE:
5606       evpexpandb(dst, mask, src, merge, vec_enc);
5607       break;
5608     case T_CHAR:
5609     case T_SHORT:
5610       evpexpandw(dst, mask, src, merge, vec_enc);
5611       break;
5612     case T_INT:
5613       evpexpandd(dst, mask, src, merge, vec_enc);
5614       break;
5615     case T_FLOAT:
5616       evexpandps(dst, mask, src, merge, vec_enc);
5617       break;
5618     case T_LONG:
5619       evpexpandq(dst, mask, src, merge, vec_enc);
5620       break;
5621     case T_DOUBLE:
5622       evexpandpd(dst, mask, src, merge, vec_enc);
5623       break;
5624     default:
5625       fatal("Unsupported type %s", type2name(bt));
5626       break;
5627     }
5628   }
5629 }
5630 
5631 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5632                                            KRegister ktmp1, int vec_enc) {
5633   if (opcode == Op_SignumVD) {
5634     vsubpd(dst, zero, one, vec_enc);
5635     // if src < 0 ? -1 : 1
5636     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5637     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5638     // if src == NaN, -0.0 or 0.0 return src.
5639     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5640     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5641   } else {
5642     assert(opcode == Op_SignumVF, "");
5643     vsubps(dst, zero, one, vec_enc);
5644     // if src < 0 ? -1 : 1
5645     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5646     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5647     // if src == NaN, -0.0 or 0.0 return src.
5648     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5649     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5650   }
5651 }
5652 
5653 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5654                                           XMMRegister xtmp1, int vec_enc) {
5655   if (opcode == Op_SignumVD) {
5656     vsubpd(dst, zero, one, vec_enc);
5657     // if src < 0 ? -1 : 1
5658     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
5659     // if src == NaN, -0.0 or 0.0 return src.
5660     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5661     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5662   } else {
5663     assert(opcode == Op_SignumVF, "");
5664     vsubps(dst, zero, one, vec_enc);
5665     // if src < 0 ? -1 : 1
5666     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
5667     // if src == NaN, -0.0 or 0.0 return src.
5668     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5669     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5670   }
5671 }
5672 
5673 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5674   if (VM_Version::supports_avx512bw()) {
5675     if (mask_len > 32) {
5676       kmovql(dst, src);
5677     } else {
5678       kmovdl(dst, src);
5679       if (mask_len != 32) {
5680         kshiftrdl(dst, dst, 32 - mask_len);
5681       }
5682     }
5683   } else {
5684     assert(mask_len <= 16, "");
5685     kmovwl(dst, src);
5686     if (mask_len != 16) {
5687       kshiftrwl(dst, dst, 16 - mask_len);
5688     }
5689   }
5690 }
5691 
5692 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5693   int lane_size = type2aelembytes(bt);
5694   if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5695       (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5696     movptr(rtmp, imm32);
5697     switch(lane_size) {
5698       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5699       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5700       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5701       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5702       fatal("Unsupported lane size %d", lane_size);
5703       break;
5704     }
5705   } else {
5706     movptr(rtmp, imm32);
5707     movq(dst, rtmp);
5708     switch(lane_size) {
5709       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5710       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5711       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5712       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5713       fatal("Unsupported lane size %d", lane_size);
5714       break;
5715     }
5716   }
5717 }
5718 
5719 //
// Following is the lookup table based popcount computation algorithm:
5721 //       Index   Bit set count
5722 //     [ 0000 ->   0,
5723 //       0001 ->   1,
5724 //       0010 ->   1,
5725 //       0011 ->   2,
5726 //       0100 ->   1,
5727 //       0101 ->   2,
5728 //       0110 ->   2,
5729 //       0111 ->   3,
5730 //       1000 ->   1,
5731 //       1001 ->   2,
5732 //       1010 ->   3,
5733 //       1011 ->   3,
5734 //       1100 ->   2,
5735 //       1101 ->   3,
5736 //       1111 ->   4 ]
5737 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5738 //     shuffle indices for lookup table access.
5739 //  b. Right shift each byte of vector lane by 4 positions.
//  c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as
//     shuffle indices for lookup table access.
5742 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5743 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5744 //     count of all the bytes of a quadword.
5745 //  f. Perform step e. for upper 128bit vector lane.
5746 //  g. Pack the bitset count of quadwords back to double word.
5747 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
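//
// Illustrative scalar equivalent of steps a-d for a single byte b (comment only, not emitted code):
//   popcount(b) == LUT[b & 0x0F] + LUT[(b >> 4) & 0x0F]
// e.g. b = 0xB6 (1011 0110b): LUT[0x6] + LUT[0xB] = 2 + 3 = 5 set bits.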
5748 
5749 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5750                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5751   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5752   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5753   vpsrlw(dst, src, 4, vec_enc);
5754   vpand(dst, dst, xtmp1, vec_enc);
5755   vpand(xtmp1, src, xtmp1, vec_enc);
5756   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5757   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5758   vpshufb(dst, xtmp2, dst, vec_enc);
5759   vpaddb(dst, dst, xtmp1, vec_enc);
5760 }
5761 
5762 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5763                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5764   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5765   // Following code is as per steps e,f,g and h of above algorithm.
5766   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5767   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5768   vpsadbw(dst, dst, xtmp2, vec_enc);
5769   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5770   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5771   vpackuswb(dst, xtmp1, dst, vec_enc);
5772 }
5773 
5774 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5775                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5776   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5777   // Add the popcount of upper and lower bytes of word.
5778   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5779   vpsrlw(dst, xtmp1, 8, vec_enc);
5780   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5781   vpaddw(dst, dst, xtmp1, vec_enc);
5782 }
5783 
5784 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5785                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5786   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5787   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5788   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5789 }
5790 
5791 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5792                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5793   switch(bt) {
5794     case T_LONG:
5795       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5796       break;
5797     case T_INT:
5798       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5799       break;
5800     case T_CHAR:
5801     case T_SHORT:
5802       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5803       break;
5804     case T_BYTE:
5805     case T_BOOLEAN:
5806       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5807       break;
5808     default:
5809       fatal("Unsupported type %s", type2name(bt));
5810       break;
5811   }
5812 }
5813 
5814 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5815                                                       KRegister mask, bool merge, int vec_enc) {
5816   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5817   switch(bt) {
5818     case T_LONG:
5819       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5820       evpopcntq(dst, mask, src, merge, vec_enc);
5821       break;
5822     case T_INT:
5823       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5824       evpopcntd(dst, mask, src, merge, vec_enc);
5825       break;
5826     case T_CHAR:
5827     case T_SHORT:
5828       assert(VM_Version::supports_avx512_bitalg(), "");
5829       evpopcntw(dst, mask, src, merge, vec_enc);
5830       break;
5831     case T_BYTE:
5832     case T_BOOLEAN:
5833       assert(VM_Version::supports_avx512_bitalg(), "");
5834       evpopcntb(dst, mask, src, merge, vec_enc);
5835       break;
5836     default:
5837       fatal("Unsupported type %s", type2name(bt));
5838       break;
5839   }
5840 }
5841 
// The bit reversal algorithm first reverses the bits of each byte, followed by
// a byte level reversal for multi-byte primitive types (short/int/long).
// The algorithm uses a lookup table access to get the reverse bit sequence
// corresponding to a 4 bit value. Thus the reverse bit sequence for a byte
// is obtained by swapping the reverse bit sequences of the upper and lower
// nibbles of the byte.
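// Illustrative scalar equivalent for a single byte b (comment only, not emitted code):
//   reverse8(b) == (REV_LUT[b & 0x0F] << 4) | REV_LUT[(b >> 4) & 0x0F]
// where REV_LUT[n] is the 4 bit reversal of n; multi-byte types are then byte-reversed.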
5848 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5849                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5850   if (VM_Version::supports_avx512vlbw()) {
5851 
5852     // Get the reverse bit sequence of lower nibble of each byte.
5853     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5854     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5855     evpandq(dst, xtmp2, src, vec_enc);
5856     vpshufb(dst, xtmp1, dst, vec_enc);
5857     vpsllq(dst, dst, 4, vec_enc);
5858 
5859     // Get the reverse bit sequence of upper nibble of each byte.
5860     vpandn(xtmp2, xtmp2, src, vec_enc);
5861     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5862     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5863 
5864     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5865     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5866     evporq(xtmp2, dst, xtmp2, vec_enc);
5867     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5868 
  } else if (vec_enc == Assembler::AVX_512bit) {
5870     // Shift based bit reversal.
5871     assert(bt == T_LONG || bt == T_INT, "");
5872 
5873     // Swap lower and upper nibble of each byte.
5874     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5875 
5876     // Swap two least and most significant bits of each nibble.
5877     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5878 
5879     // Swap adjacent pair of bits.
5880     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5881     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5882 
5883     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5884     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5885   } else {
5886     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5887     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5888 
5889     // Get the reverse bit sequence of lower nibble of each byte.
5890     vpand(dst, xtmp2, src, vec_enc);
5891     vpshufb(dst, xtmp1, dst, vec_enc);
5892     vpsllq(dst, dst, 4, vec_enc);
5893 
5894     // Get the reverse bit sequence of upper nibble of each byte.
5895     vpandn(xtmp2, xtmp2, src, vec_enc);
5896     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5897     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5898 
5899     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5900     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5901     vpor(xtmp2, dst, xtmp2, vec_enc);
5902     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5903   }
5904 }
5905 
5906 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5907                                                 XMMRegister xtmp, Register rscratch) {
5908   assert(VM_Version::supports_gfni(), "");
5909   assert(rscratch != noreg || always_reachable(mask), "missing");
5910 
  // Galois field instruction based bit reversal, as per the following algorithm:
  // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5913   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5914   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5915   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5916 }
5917 
5918 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5919                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
5920   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5921   evpandq(dst, xtmp1, src, vec_enc);
5922   vpsllq(dst, dst, nbits, vec_enc);
5923   vpandn(xtmp1, xtmp1, src, vec_enc);
5924   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5925   evporq(dst, dst, xtmp1, vec_enc);
5926 }
5927 
5928 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5929                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5930   // Shift based bit reversal.
5931   assert(VM_Version::supports_evex(), "");
5932   switch(bt) {
5933     case T_LONG:
5934       // Swap upper and lower double word of each quad word.
5935       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5936       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5937       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5938       break;
5939     case T_INT:
5940       // Swap upper and lower word of each double word.
5941       evprord(xtmp1, k0, src, 16, true, vec_enc);
5942       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5943       break;
5944     case T_CHAR:
5945     case T_SHORT:
5946       // Swap upper and lower byte of each word.
5947       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5948       break;
5949     case T_BYTE:
5950       evmovdquq(dst, k0, src, true, vec_enc);
5951       break;
5952     default:
5953       fatal("Unsupported type %s", type2name(bt));
5954       break;
5955   }
5956 }
5957 
5958 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5959   if (bt == T_BYTE) {
5960     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
5961       evmovdquq(dst, k0, src, true, vec_enc);
5962     } else {
5963       vmovdqu(dst, src);
5964     }
5965     return;
5966   }
5967   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
5968   // pre-computed shuffle indices.
5969   switch(bt) {
5970     case T_LONG:
5971       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
5972       break;
5973     case T_INT:
5974       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
5975       break;
5976     case T_CHAR:
5977     case T_SHORT:
5978       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
5979       break;
5980     default:
5981       fatal("Unsupported type %s", type2name(bt));
5982       break;
5983   }
5984   vpshufb(dst, src, dst, vec_enc);
5985 }
5986 
5987 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5988                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5989                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
5990   assert(is_integral_type(bt), "");
5991   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5992   assert(VM_Version::supports_avx512cd(), "");
5993   switch(bt) {
5994     case T_LONG:
5995       evplzcntq(dst, ktmp, src, merge, vec_enc);
5996       break;
5997     case T_INT:
5998       evplzcntd(dst, ktmp, src, merge, vec_enc);
5999       break;
6000     case T_SHORT:
6001       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6002       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6003       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6004       vpunpckhwd(dst, xtmp1, src, vec_enc);
6005       evplzcntd(dst, ktmp, dst, merge, vec_enc);
6006       vpackusdw(dst, xtmp2, dst, vec_enc);
6007       break;
6008     case T_BYTE:
6009       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6010       // accessing the lookup table.
6011       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6012       // accessing the lookup table.
6013       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6014       assert(VM_Version::supports_avx512bw(), "");
6015       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6016       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6017       vpand(xtmp2, dst, src, vec_enc);
6018       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6019       vpsrlw(xtmp3, src, 4, vec_enc);
6020       vpand(xtmp3, dst, xtmp3, vec_enc);
6021       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6022       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6023       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6024       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6025       break;
6026     default:
6027       fatal("Unsupported type %s", type2name(bt));
6028       break;
6029   }
6030 }
6031 
6032 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6033                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6034   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6035   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6036   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6037   // accessing the lookup table.
6038   vpand(dst, xtmp2, src, vec_enc);
6039   vpshufb(dst, xtmp1, dst, vec_enc);
6040   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6041   // accessing the lookup table.
6042   vpsrlw(xtmp3, src, 4, vec_enc);
6043   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6044   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6045   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6046   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6047   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6048   vpaddb(dst, dst, xtmp2, vec_enc);
6049   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6050 }
6051 
6052 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6053                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6054   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6055   // Add zero counts of lower byte and upper byte of a word if
6056   // upper byte holds a zero value.
6057   vpsrlw(xtmp3, src, 8, vec_enc);
6058   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6059   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6060   vpsllw(xtmp2, dst, 8, vec_enc);
6061   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6062   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6063   vpsrlw(dst, dst, 8, vec_enc);
6064 }
6065 
6066 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6067                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in 1.x form,
  // the biased exponent can be used to compute the leading zero count as per the
  // following formula:
  // LZCNT = 31 - (biased_exp - 127)
  // Special handling has been introduced for zero, Max_Int and -ve source values.
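  // e.g. src = 8: float(8.0) has biased_exp = 130, so LZCNT = 31 - (130 - 127) = 28,
  // which matches lzcnt(0x00000008) for a 32 bit lane.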
6073 
6074   // Broadcast 0xFF
6075   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6076   vpsrld(xtmp1, xtmp1, 24, vec_enc);
6077 
6078   // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
6079   // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
6080   // contributes to the leading number of zeros.
6081   vpsrld(xtmp2, src, 1, vec_enc);
6082   vpandn(xtmp3, xtmp2, src, vec_enc);
6083 
6084   // Extract biased exponent.
6085   vcvtdq2ps(dst, xtmp3, vec_enc);
6086   vpsrld(dst, dst, 23, vec_enc);
6087   vpand(dst, dst, xtmp1, vec_enc);
6088 
6089   // Broadcast 127.
6090   vpsrld(xtmp1, xtmp1, 1, vec_enc);
6091   // Exponent = biased_exp - 127
6092   vpsubd(dst, dst, xtmp1, vec_enc);
6093 
6094   // Exponent_plus_one = Exponent + 1
6095   vpsrld(xtmp3, xtmp1, 6, vec_enc);
6096   vpaddd(dst, dst, xtmp3, vec_enc);
6097 
6098   // Replace -ve exponent with zero, exponent is -ve when src
6099   // lane contains a zero value.
6100   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6101   vblendvps(dst, dst, xtmp2, dst, vec_enc);
6102 
6103   // Rematerialize broadcast 32.
6104   vpslld(xtmp1, xtmp3, 5, vec_enc);
6105   // Exponent is 32 if corresponding source lane contains max_int value.
6106   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6107   // LZCNT = 32 - exponent_plus_one
6108   vpsubd(dst, xtmp1, dst, vec_enc);
6109 
6110   // Replace LZCNT with a value 1 if corresponding source lane
6111   // contains max_int value.
6112   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6113 
6114   // Replace biased_exp with 0 if source lane value is less than zero.
6115   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6116   vblendvps(dst, dst, xtmp2, src, vec_enc);
6117 }
6118 
6119 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6120                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6121   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6122   // Add zero counts of lower word and upper word of a double word if
6123   // upper word holds a zero value.
6124   vpsrld(xtmp3, src, 16, vec_enc);
6125   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6126   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6127   vpslld(xtmp2, dst, 16, vec_enc);
6128   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6129   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6130   vpsrld(dst, dst, 16, vec_enc);
6131   // Add zero counts of lower doubleword and upper doubleword of a
6132   // quadword if upper doubleword holds a zero value.
6133   vpsrlq(xtmp3, src, 32, vec_enc);
6134   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6135   vpsllq(xtmp2, dst, 32, vec_enc);
6136   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6137   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6138   vpsrlq(dst, dst, 32, vec_enc);
6139 }
6140 
6141 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6142                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6143                                                        Register rtmp, int vec_enc) {
6144   assert(is_integral_type(bt), "unexpected type");
6145   assert(vec_enc < Assembler::AVX_512bit, "");
6146   switch(bt) {
6147     case T_LONG:
6148       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6149       break;
6150     case T_INT:
6151       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6152       break;
6153     case T_SHORT:
6154       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6155       break;
6156     case T_BYTE:
6157       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6158       break;
6159     default:
6160       fatal("Unsupported type %s", type2name(bt));
6161       break;
6162   }
6163 }
6164 
6165 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6166   switch(bt) {
6167     case T_BYTE:
6168       vpsubb(dst, src1, src2, vec_enc);
6169       break;
6170     case T_SHORT:
6171       vpsubw(dst, src1, src2, vec_enc);
6172       break;
6173     case T_INT:
6174       vpsubd(dst, src1, src2, vec_enc);
6175       break;
6176     case T_LONG:
6177       vpsubq(dst, src1, src2, vec_enc);
6178       break;
6179     default:
6180       fatal("Unsupported type %s", type2name(bt));
6181       break;
6182   }
6183 }
6184 
// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
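// e.g. for an 8 bit lane with x = 0b00001000: (x - 1) & ~x = 0b00000111, CLZ = 5,
// so CTZ = 8 - 5 = 3; for x = 0 the expression yields all ones, CLZ = 0 and CTZ = 8.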
6189 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6190                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6191                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6192   assert(is_integral_type(bt), "");
6193   // xtmp = -1
6194   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6195   // xtmp = xtmp + src
6196   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6197   // xtmp = xtmp & ~src
6198   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6199   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6200   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6201   vpsub(bt, dst, xtmp4, dst, vec_enc);
6202 }
6203 
// Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation:
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
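// e.g. for an 8 bit lane with x = 0b00001000: x | -x = 0b11111000, POPC = 5, so CTZ = 8 - 5 = 3;
// for x = 0 the expression yields 0, POPC = 0 and CTZ = 8.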
6206 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6207                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6208   assert(is_integral_type(bt), "");
6209   // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6211   // xtmp = 0 - src
6212   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6213   // xtmp = xtmp | src
6214   vpor(xtmp3, xtmp3, src, vec_enc);
6215   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6216   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6217   vpsub(bt, dst, xtmp1, dst, vec_enc);
6218 }
6219 
6220 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6221   Label done;
6222   Label neg_divisor_fastpath;
6223   cmpl(divisor, 0);
6224   jccb(Assembler::less, neg_divisor_fastpath);
6225   xorl(rdx, rdx);
6226   divl(divisor);
6227   jmpb(done);
6228   bind(neg_divisor_fastpath);
6229   // Fastpath for divisor < 0:
6230   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6231   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
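  // Since the divisor is >=u 2^31 on this path, the unsigned quotient is 1 iff dividend >=u divisor, else 0.
  // e.g. dividend = 0xF0000000, divisor = 0x80000000: dividend - divisor = 0x70000000,
  // dividend & ~0x70000000 = 0x80000000, and >>> 31 gives quotient 1.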
6232   movl(rdx, rax);
6233   subl(rdx, divisor);
6234   if (VM_Version::supports_bmi1()) {
6235     andnl(rax, rdx, rax);
6236   } else {
6237     notl(rdx);
6238     andl(rax, rdx);
6239   }
6240   shrl(rax, 31);
6241   bind(done);
6242 }
6243 
6244 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6245   Label done;
6246   Label neg_divisor_fastpath;
6247   cmpl(divisor, 0);
6248   jccb(Assembler::less, neg_divisor_fastpath);
6249   xorl(rdx, rdx);
6250   divl(divisor);
6251   jmpb(done);
6252   bind(neg_divisor_fastpath);
6253   // Fastpath when divisor < 0:
6254   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6255   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6256   movl(rdx, rax);
6257   subl(rax, divisor);
6258   if (VM_Version::supports_bmi1()) {
6259     andnl(rax, rax, rdx);
6260   } else {
6261     notl(rax);
6262     andl(rax, rdx);
6263   }
6264   sarl(rax, 31);
6265   andl(rax, divisor);
6266   subl(rdx, rax);
6267   bind(done);
6268 }
6269 
6270 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6271   Label done;
6272   Label neg_divisor_fastpath;
6273 
6274   cmpl(divisor, 0);
6275   jccb(Assembler::less, neg_divisor_fastpath);
6276   xorl(rdx, rdx);
6277   divl(divisor);
6278   jmpb(done);
6279   bind(neg_divisor_fastpath);
6280   // Fastpath for divisor < 0:
6281   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6282   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6283   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6284   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6285   movl(rdx, rax);
6286   subl(rax, divisor);
6287   if (VM_Version::supports_bmi1()) {
6288     andnl(rax, rax, rdx);
6289   } else {
6290     notl(rax);
6291     andl(rax, rdx);
6292   }
6293   movl(tmp, rax);
6294   shrl(rax, 31); // quotient
6295   sarl(tmp, 31);
6296   andl(tmp, divisor);
6297   subl(rdx, tmp); // remainder
6298   bind(done);
6299 }
6300 
6301 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6302                                  XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, as per the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6306     mov64(rtmp, 0x8040201008040201L);
6307     movq(xtmp1, src);
6308     movq(xtmp2, rtmp);
6309     gf2p8affineqb(xtmp1, xtmp2, 0);
6310     movq(dst, xtmp1);
6311   } else {
6312     // Swap even and odd numbered bits.
6313     movl(rtmp, src);
6314     andl(rtmp, 0x55555555);
6315     shll(rtmp, 1);
6316     movl(dst, src);
6317     andl(dst, 0xAAAAAAAA);
6318     shrl(dst, 1);
6319     orl(dst, rtmp);
6320 
6321     // Swap LSB and MSB 2 bits of each nibble.
6322     movl(rtmp, dst);
6323     andl(rtmp, 0x33333333);
6324     shll(rtmp, 2);
6325     andl(dst, 0xCCCCCCCC);
6326     shrl(dst, 2);
6327     orl(dst, rtmp);
6328 
6329     // Swap LSB and MSB 4 bits of each byte.
6330     movl(rtmp, dst);
6331     andl(rtmp, 0x0F0F0F0F);
6332     shll(rtmp, 4);
6333     andl(dst, 0xF0F0F0F0);
6334     shrl(dst, 4);
6335     orl(dst, rtmp);
6336   }
6337   bswapl(dst);
6338 }
6339 
6340 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6341                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, as per the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6345     mov64(rtmp1, 0x8040201008040201L);
6346     movq(xtmp1, src);
6347     movq(xtmp2, rtmp1);
6348     gf2p8affineqb(xtmp1, xtmp2, 0);
6349     movq(dst, xtmp1);
6350   } else {
6351     // Swap even and odd numbered bits.
6352     movq(rtmp1, src);
6353     mov64(rtmp2, 0x5555555555555555L);
6354     andq(rtmp1, rtmp2);
6355     shlq(rtmp1, 1);
6356     movq(dst, src);
6357     notq(rtmp2);
6358     andq(dst, rtmp2);
6359     shrq(dst, 1);
6360     orq(dst, rtmp1);
6361 
6362     // Swap LSB and MSB 2 bits of each nibble.
6363     movq(rtmp1, dst);
6364     mov64(rtmp2, 0x3333333333333333L);
6365     andq(rtmp1, rtmp2);
6366     shlq(rtmp1, 2);
6367     notq(rtmp2);
6368     andq(dst, rtmp2);
6369     shrq(dst, 2);
6370     orq(dst, rtmp1);
6371 
6372     // Swap LSB and MSB 4 bits of each byte.
6373     movq(rtmp1, dst);
6374     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6375     andq(rtmp1, rtmp2);
6376     shlq(rtmp1, 4);
6377     notq(rtmp2);
6378     andq(dst, rtmp2);
6379     shrq(dst, 4);
6380     orq(dst, rtmp1);
6381   }
6382   bswapq(dst);
6383 }
6384 
6385 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6386   Label done;
6387   Label neg_divisor_fastpath;
6388   cmpq(divisor, 0);
6389   jccb(Assembler::less, neg_divisor_fastpath);
6390   xorl(rdx, rdx);
6391   divq(divisor);
6392   jmpb(done);
6393   bind(neg_divisor_fastpath);
6394   // Fastpath for divisor < 0:
6395   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6396   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6397   movq(rdx, rax);
6398   subq(rdx, divisor);
6399   if (VM_Version::supports_bmi1()) {
6400     andnq(rax, rdx, rax);
6401   } else {
6402     notq(rdx);
6403     andq(rax, rdx);
6404   }
6405   shrq(rax, 63);
6406   bind(done);
6407 }
6408 
6409 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6410   Label done;
6411   Label neg_divisor_fastpath;
6412   cmpq(divisor, 0);
6413   jccb(Assembler::less, neg_divisor_fastpath);
6414   xorq(rdx, rdx);
6415   divq(divisor);
6416   jmp(done);
6417   bind(neg_divisor_fastpath);
6418   // Fastpath when divisor < 0:
6419   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6420   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6421   movq(rdx, rax);
6422   subq(rax, divisor);
6423   if (VM_Version::supports_bmi1()) {
6424     andnq(rax, rax, rdx);
6425   } else {
6426     notq(rax);
6427     andq(rax, rdx);
6428   }
6429   sarq(rax, 63);
6430   andq(rax, divisor);
6431   subq(rdx, rax);
6432   bind(done);
6433 }
6434 
6435 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6436   Label done;
6437   Label neg_divisor_fastpath;
6438   cmpq(divisor, 0);
6439   jccb(Assembler::less, neg_divisor_fastpath);
6440   xorq(rdx, rdx);
6441   divq(divisor);
6442   jmp(done);
6443   bind(neg_divisor_fastpath);
6444   // Fastpath for divisor < 0:
6445   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6446   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6447   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6448   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6449   movq(rdx, rax);
6450   subq(rax, divisor);
6451   if (VM_Version::supports_bmi1()) {
6452     andnq(rax, rax, rdx);
6453   } else {
6454     notq(rax);
6455     andq(rax, rdx);
6456   }
6457   movq(tmp, rax);
6458   shrq(rax, 63); // quotient
6459   sarq(tmp, 63);
6460   andq(tmp, divisor);
6461   subq(rdx, tmp); // remainder
6462   bind(done);
6463 }
6464 
6465 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6466                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6467                                         int vlen_enc) {
6468   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using
  // the lower 4 bits of each shuffle lane, thus all shuffle indices are
  // normalized to the index range 0-15. This makes sure that indices differing
  // by a multiple of 16 map to the same relative position within a 128 bit
  // lane, i.e. elements corresponding to shuffle indices 16, 32 and 48 all
  // select the 0th element of their respective 128 bit lanes.
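  // e.g. shuffle index 37: 37 & 0x0F = 5 selects byte 5 within a lane, while the masked
  // passes below (37 lies in [32, 48)) keep the result computed from the broadcast of the
  // third 128 bit lane of src.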
6475   movl(rtmp, 16);
6476   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6477 
  // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16.
  // Broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using
  // the original shuffle indices and move the shuffled lanes corresponding to a true
  // mask into the destination vector.
6482   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6483   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6484   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6485 
6486   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6487   // and broadcasting second 128 bit lane.
6488   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6489   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6490   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6491   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6492   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6493 
6494   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6495   // and broadcasting third 128 bit lane.
6496   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6497   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6498   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6499   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6500   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6501 
  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
6504   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6505   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6506   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6507   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6508   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6509 }
6510 
6511 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6512                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6513   if (vlen_enc == AVX_128bit) {
6514     vpermilps(dst, src, shuffle, vlen_enc);
6515   } else if (bt == T_INT) {
6516     vpermd(dst, shuffle, src, vlen_enc);
6517   } else {
6518     assert(bt == T_FLOAT, "");
6519     vpermps(dst, shuffle, src, vlen_enc);
6520   }
6521 }
6522 
6523 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6524   switch(opcode) {
6525     case Op_AddHF: vaddsh(dst, src1, src2); break;
6526     case Op_SubHF: vsubsh(dst, src1, src2); break;
6527     case Op_MulHF: vmulsh(dst, src1, src2); break;
6528     case Op_DivHF: vdivsh(dst, src1, src2); break;
6529     default: assert(false, "%s", NodeClassNames[opcode]); break;
6530   }
6531 }
6532 
6533 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6534   switch(elem_bt) {
6535     case T_BYTE:
6536       if (ideal_opc == Op_SaturatingAddV) {
6537         vpaddsb(dst, src1, src2, vlen_enc);
6538       } else {
6539         assert(ideal_opc == Op_SaturatingSubV, "");
6540         vpsubsb(dst, src1, src2, vlen_enc);
6541       }
6542       break;
6543     case T_SHORT:
6544       if (ideal_opc == Op_SaturatingAddV) {
6545         vpaddsw(dst, src1, src2, vlen_enc);
6546       } else {
6547         assert(ideal_opc == Op_SaturatingSubV, "");
6548         vpsubsw(dst, src1, src2, vlen_enc);
6549       }
6550       break;
6551     default:
6552       fatal("Unsupported type %s", type2name(elem_bt));
6553       break;
6554   }
6555 }
6556 
6557 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6558   switch(elem_bt) {
6559     case T_BYTE:
6560       if (ideal_opc == Op_SaturatingAddV) {
6561         vpaddusb(dst, src1, src2, vlen_enc);
6562       } else {
6563         assert(ideal_opc == Op_SaturatingSubV, "");
6564         vpsubusb(dst, src1, src2, vlen_enc);
6565       }
6566       break;
6567     case T_SHORT:
6568       if (ideal_opc == Op_SaturatingAddV) {
6569         vpaddusw(dst, src1, src2, vlen_enc);
6570       } else {
6571         assert(ideal_opc == Op_SaturatingSubV, "");
6572         vpsubusw(dst, src1, src2, vlen_enc);
6573       }
6574       break;
6575     default:
6576       fatal("Unsupported type %s", type2name(elem_bt));
6577       break;
6578   }
6579 }
6580 
6581 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6582                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when the second input is greater than the first input (as unsigned values).
6584   // overflow_mask = Inp1 <u Inp2
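  // Lanes where Inp1 <u Inp2 are forced to zero, the remaining lanes keep Inp1 - Inp2;
  // e.g. for int lanes: 5 - 9 saturates to 0, while 9 - 5 yields 4.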
6585   evpcmpu(elem_bt, ktmp,  src2, src1, Assembler::lt, vlen_enc);
6586   // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6587   evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6588 }
6589 
6590 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6591                                                               XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6592   // Emulate unsigned comparison using signed comparison
6593   // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
6594   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6595   vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6596   vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6597 
6598   vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6599 
6600   // Res = INP1 - INP2 (non-commutative and non-associative)
6601   vpsub(elem_bt, dst, src1, src2, vlen_enc);
6602   // Res = Mask ? Zero : Res
6603   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6604   vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6605 }
6606 
6607 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6608                                                                XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned value ranges comprise only non-negative numbers, thus only upper bound saturation exists.
6610   // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6611   // Res = Signed Add INP1, INP2
6612   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6613   // T1 = SRC1 | SRC2
6614   vpor(xtmp1, src1, src2, vlen_enc);
6615   // Max_Unsigned = -1
6616   vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6617   // Unsigned compare:  Mask = Res <u T1
6618   evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6619   // res  = Mask ? Max_Unsigned : Res
6620   evpblend(elem_bt, dst, ktmp,  dst, xtmp2, true, vlen_enc);
6621 }
6622 
6623 //
// Section 2-13 of Hacker's Delight lists the following overflow detection check for the
// saturating unsigned addition operation.
//    overflow_mask = ((a & b) | ((a | b) & ~(a + b))) >>> 31 == 1
//
// We empirically determined its semantic equivalence to the following reduced expression
//    overflow_mask = (a + b) <u (a | b)
//
// and also verified it through the Alive2 solver.
6632 // (https://alive2.llvm.org/ce/z/XDQ7dY)
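//
// e.g. for 8 bit lanes: a = 200, b = 100 gives a + b = 44 (wrapped) and a | b = 236,
// and 44 <u 236 flags the overflow; a = 3, b = 5 gives 8 and 7, and 8 <u 7 is false.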
6633 //
6634 
6635 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6636                                                               XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6637   // Res = Signed Add INP1, INP2
6638   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6639   // Compute T1 = INP1 | INP2
6640   vpor(xtmp3, src1, src2, vlen_enc);
  // xtmp2 = Minimum signed value (MIN_VALUE).
6642   vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6643   // Convert T1 to signed value, T1 = T1 + MIN_VALUE
6644   vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
6645   // Convert Res to signed value, Res<s> = Res + MIN_VALUE
6646   vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<s> <s T1
6648   if (elem_bt == T_INT) {
6649     vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
6650   } else {
6651     assert(elem_bt == T_LONG, "");
6652     vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
6653   }
6654   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6655 }
6656 
6657 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6658                                       int vlen_enc, bool xtmp2_hold_M1) {
6659   if (VM_Version::supports_avx512dq()) {
6660     evpmovq2m(ktmp, src, vlen_enc);
6661   } else {
6662     assert(VM_Version::supports_evex(), "");
6663     if (!xtmp2_hold_M1) {
6664       vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6665     }
6666     evpsraq(xtmp1, src, 63, vlen_enc);
6667     evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6668   }
6669 }
6670 
6671 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6672                                       int vlen_enc, bool xtmp2_hold_M1) {
6673   if (VM_Version::supports_avx512dq()) {
6674     evpmovd2m(ktmp, src, vlen_enc);
6675   } else {
6676     assert(VM_Version::supports_evex(), "");
6677     if (!xtmp2_hold_M1) {
6678       vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6679     }
6680     vpsrad(xtmp1, src, 31, vlen_enc);
6681     Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6682   }
6683 }
6684 
6685 
6686 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6687   if (elem_bt == T_LONG) {
6688     if (VM_Version::supports_evex()) {
6689       evpsraq(dst, src, 63, vlen_enc);
6690     } else {
6691       vpsrad(dst, src, 31, vlen_enc);
6692       vpshufd(dst, dst, 0xF5, vlen_enc);
6693     }
6694   } else {
6695     assert(elem_bt == T_INT, "");
6696     vpsrad(dst, src, 31, vlen_enc);
6697   }
6698 }
6699 
6700 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6701   if (compute_allones) {
6702     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6703       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6704     } else {
6705       vpcmpeqq(allones, allones, allones, vlen_enc);
6706     }
6707   }
6708   if (elem_bt == T_LONG) {
6709     vpsrlq(dst, allones, 1, vlen_enc);
6710   } else {
6711     assert(elem_bt == T_INT, "");
6712     vpsrld(dst, allones, 1, vlen_enc);
6713   }
6714 }
6715 
6716 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6717   if (compute_allones) {
6718     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6719       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6720     } else {
6721       vpcmpeqq(allones, allones, allones, vlen_enc);
6722     }
6723   }
6724   if (elem_bt == T_LONG) {
6725     vpsllq(dst, allones, 63, vlen_enc);
6726   } else {
6727     assert(elem_bt == T_INT, "");
6728     vpslld(dst, allones, 31, vlen_enc);
6729   }
6730 }
6731 
6732 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
6733                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6734   switch(elem_bt) {
6735     case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6736     case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6737     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6738     case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6739     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6740   }
6741 }
6742 
6743 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6744   switch(elem_bt) {
6745     case  T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
6746     case  T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
6747     case  T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6748     case  T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
6749     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6750   }
6751 }
6752 
6753 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6754                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6755   if (elem_bt == T_LONG) {
6756     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6757   } else {
6758     assert(elem_bt == T_INT, "");
6759     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6760   }
6761 }
6762 
6763 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6764                                                          XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6765                                                          KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6766   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection is based on Hacker's Delight section 2-13.
6769   if (ideal_opc == Op_SaturatingAddV) {
6770     // res = src1 + src2
6771     vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if both inputs have the same polarity and the result's polarity differs from it.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
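    // e.g. for int lanes: src1 = src2 = 0x7FFFFFFF gives res = 0xFFFFFFFE and
    // (res ^ src1) & (res ^ src2) = 0x80000001, whose set sign bit flags the overflow.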
6774     vpxor(xtmp1, dst, src1, vlen_enc);
6775     vpxor(xtmp2, dst, src2, vlen_enc);
6776     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6777   } else {
6778     assert(ideal_opc == Op_SaturatingSubV, "");
6779     // res = src1 - src2
6780     vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when the inputs have opposite polarity and
    // the result's polarity differs from the first input's polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
6784     vpxor(xtmp1, src1, src2, vlen_enc);
6785     vpxor(xtmp2, dst, src1, vlen_enc);
6786     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6787   }
6788 
6789   // Compute overflow detection mask.
6790   evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 holds -1 in all its lanes after the above call.
6792 
6793   // Compute mask based on first input polarity.
6794   evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
6795 
6796   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6797   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6798 
  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in the first input polarity mask hold the min value.
6801   evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6802   // Blend destination lanes with saturated values using overflow detection mask.
6803   evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6804 }
6805 
6806 
6807 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6808                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6809                                                         XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
6810   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6811   // Addition/subtraction happens over the two's complement representation of the numbers and is agnostic to signedness.
6812   // Overflow detection is based on Hacker's Delight, section 2-13.
6813   if (ideal_opc == Op_SaturatingAddV) {
6814     // res = src1 + src2
6815     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6816     // Overflow occurs if both inputs have the same polarity and the result polarity differs from it.
6817     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6818     vpxor(xtmp1, dst, src1, vlen_enc);
6819     vpxor(xtmp2, dst, src2, vlen_enc);
6820     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6821   } else {
6822     assert(ideal_opc == Op_SaturatingSubV, "");
6823     // res = src1 - src2
6824     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6825     // Overflow occurs when the inputs have opposite polarity and the
6826     // result polarity differs from that of the first input.
6827     // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
6828     vpxor(xtmp1, src1, src2, vlen_enc);
6829     vpxor(xtmp2, dst, src1, vlen_enc);
6830     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6831   }
6832 
6833   // Sign-extend to compute overflow detection mask.
6834   vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
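       // The sign extension replicates each lane's sign bit across the entire lane, yielding an
       // all-ones/all-zeros mask per lane; since every byte of such a lane has the same MSB, the
       // byte-granular vpblendvb below selects whole lanes correctly.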
6835 
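       // Set xtmp1 to all-ones (-1) in every lane; it is used by the min/max value generators below.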
6836   vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
6837   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
6838   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6839 
6840   // Compose saturating min/max vector using first input polarity mask.
6841   vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
6842   vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
6843 
6844   // Blend result with saturating vector using overflow detection mask.
6845   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6846 }
6847 
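     // Memory-operand dispatchers for the packed saturating add/sub instructions: the signed
     // forms (vpadds*/vpsubs*) clamp the result to the signed byte/word range, while the
     // unsigned forms (vpaddus*/vpsubus*) clamp to [0, 0xFF] / [0, 0xFFFF].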
6848 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6849   switch(elem_bt) {
6850     case T_BYTE:
6851       if (ideal_opc == Op_SaturatingAddV) {
6852         vpaddsb(dst, src1, src2, vlen_enc);
6853       } else {
6854         assert(ideal_opc == Op_SaturatingSubV, "");
6855         vpsubsb(dst, src1, src2, vlen_enc);
6856       }
6857       break;
6858     case T_SHORT:
6859       if (ideal_opc == Op_SaturatingAddV) {
6860         vpaddsw(dst, src1, src2, vlen_enc);
6861       } else {
6862         assert(ideal_opc == Op_SaturatingSubV, "");
6863         vpsubsw(dst, src1, src2, vlen_enc);
6864       }
6865       break;
6866     default:
6867       fatal("Unsupported type %s", type2name(elem_bt));
6868       break;
6869   }
6870 }
6871 
6872 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6873   switch(elem_bt) {
6874     case T_BYTE:
6875       if (ideal_opc == Op_SaturatingAddV) {
6876         vpaddusb(dst, src1, src2, vlen_enc);
6877       } else {
6878         assert(ideal_opc == Op_SaturatingSubV, "");
6879         vpsubusb(dst, src1, src2, vlen_enc);
6880       }
6881       break;
6882     case T_SHORT:
6883       if (ideal_opc == Op_SaturatingAddV) {
6884         vpaddusw(dst, src1, src2, vlen_enc);
6885       } else {
6886         assert(ideal_opc == Op_SaturatingSubV, "");
6887         vpsubusw(dst, src1, src2, vlen_enc);
6888       }
6889       break;
6890     default:
6891       fatal("Unsupported type %s", type2name(elem_bt));
6892       break;
6893   }
6894 }
6895 
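     // Selects elements from the concatenation of src1 and src2 using the per-lane indices
     // supplied in dst: indices below the lane count pick from src1, the remaining indices
     // pick from src2. Note that the evpermi2* forms overwrite the index register (dst)
     // with the selected elements.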
6896 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6897                                                      XMMRegister src2, int vlen_enc) {
6898   switch(elem_bt) {
6899     case T_BYTE:
6900       evpermi2b(dst, src1, src2, vlen_enc);
6901       break;
6902     case T_SHORT:
6903       evpermi2w(dst, src1, src2, vlen_enc);
6904       break;
6905     case T_INT:
6906       evpermi2d(dst, src1, src2, vlen_enc);
6907       break;
6908     case T_LONG:
6909       evpermi2q(dst, src1, src2, vlen_enc);
6910       break;
6911     case T_FLOAT:
6912       evpermi2ps(dst, src1, src2, vlen_enc);
6913       break;
6914     case T_DOUBLE:
6915       evpermi2pd(dst, src1, src2, vlen_enc);
6916       break;
6917     default:
6918       fatal("Unsupported type %s", type2name(elem_bt));
6919       break;
6920   }
6921 }
6922 
6923 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
6924   if (is_unsigned) {
6925     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6926   } else {
6927     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6928   }
6929 }
6930 
6931 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
6932   if (is_unsigned) {
6933     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6934   } else {
6935     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6936   }
6937 }
6938 
6939 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6940   switch(opcode) {
6941     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
6942     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
6943     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
6944     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
6945     default: assert(false, "%s", NodeClassNames[opcode]); break;
6946   }
6947 }
6948 
6949 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6950   switch(opcode) {
6951     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
6952     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
6953     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
6954     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
6955     default: assert(false, "%s", NodeClassNames[opcode]); break;
6956   }
6957 }
6958 
6959 void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6960                                             KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
6961   vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
6962 }
6963 
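     // Max/min with Java floating-point semantics: NaNs propagate and +0.0 is treated as
     // greater than -0.0, which the raw vmaxph/vminph instructions do not guarantee on their own.
     // For reference, a minimal scalar sketch of the Max path (illustrative only, not emitted
     // code); float stands in for the half-float lanes and hw_max() stands in for the vmaxph
     // semantics of returning the *second* operand when both inputs are zeros or exactly one
     // input is a NaN:
     //
     //   float max_like(float a /* src1 */, float b /* src2 */) {
     //     float x = std::signbit(b) ? b : a;   // xtmp1
     //     float y = std::signbit(b) ? a : b;   // xtmp2
     //     float r = hw_max(x, y);              // dst = max(xtmp1, xtmp2)
     //     return std::isnan(x) ? x : r;        // a NaN left in xtmp1 must still win
     //   }
     //
     // The Min path mirrors this, taking the sign bit from src1 and using vminph instead.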
6964 void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6965                                             KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6966   if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
6967     // Move sign bits of src2 to mask register.
6968     evpmovw2m(ktmp, src2, vlen_enc);
6969     // xtmp1 = src2 < 0 ? src2 : src1
6970     evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
6971     // xtmp2 = src2 < 0 ? src1 : src2
6972     evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
6973     // The idea behind the above swapping is to make the second source operand a non-negative value.
6974     // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
6975     // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
6976     // the second source operand, either a NaN or a valid floating-point value, is returned.
6977     // dst = max(xtmp1, xtmp2)
6978     evmaxph(dst, xtmp1, xtmp2, vlen_enc);
6979     // isNaN = is_unordered_quiet(xtmp1)
6980     evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
6981     // The final result is the same as the first source if it is a NaN value;
6982     // if the second operand holds a NaN value then, as per the above semantics,
6983     // the result is the same as the second operand.
6984     Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
6985   } else {
6986     assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
6987     // Move sign bits of src1 to mask register.
6988     evpmovw2m(ktmp, src1, vlen_enc);
6989     // xtmp1 = src1 < 0 ? src2 : src1
6990     evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
6991     // xtmp2 = src1 < 0 ? src1 : src2
6992     evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
6993     // The idea behind the above swapping is to make the second source operand a negative value.
6994     // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
6995     // the second source operand is returned.
6996     // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
6997     // or a valid floating-point value, is written to the result.
6998     // dst = min(xtmp1, xtmp2)
6999     evminph(dst, xtmp1, xtmp2, vlen_enc);
7000     // isNaN = is_unordered_quiet(xtmp1)
7001     evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
7002     // The final result is the same as the first source if it is a NaN value;
7003     // if the second operand holds a NaN value then, as per the above semantics,
7004     // the result is the same as the second operand.
7005     Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7006   }
7007 }